Squashed 'third_party/blasfeo/' content from commit 2a828ca
Change-Id: If1c3caa4799b2d4eb287ef83fa17043587ef07a3
git-subtree-dir: third_party/blasfeo
git-subtree-split: 2a828ca5442108c4c58e4b42b061a0469043f6ea
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 0000000..60e1f31
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,75 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../Makefile.rule
+
+obj:
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+ ( cd avx2; $(MAKE) obj)
+ ( cd avx; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+ ( cd avx; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+ ( cd sse3; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+ ( cd fma; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+ ( cd armv8a; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+ ( cd armv7a; $(MAKE) obj)
+ ( cd c99; $(MAKE) obj)
+endif
+
+ifeq ($(TARGET), GENERIC)
+ ( cd c99; $(MAKE) obj)
+endif
+
+clean:
+	$(MAKE) -C avx2 clean
+	$(MAKE) -C avx clean
+	$(MAKE) -C sse3 clean
+	$(MAKE) -C fma clean
+	$(MAKE) -C armv8a clean
+	$(MAKE) -C armv7a clean
+	$(MAKE) -C c99 clean
+
diff --git a/kernel/armv7a/Makefile b/kernel/armv7a/Makefile
new file mode 100644
index 0000000..4cb59a7
--- /dev/null
+++ b/kernel/armv7a/Makefile
@@ -0,0 +1,49 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += kernel_dgemm_4x4_lib4.o
+OBJS += kernel_sgemm_12x4_lib4.o kernel_sgemm_8x4_lib4.o kernel_sgemm_4x4_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
diff --git a/kernel/armv7a/kernel_dgemm_4x4_lib4.S b/kernel/armv7a/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..86aee4f
--- /dev/null
+++ b/kernel/armv7a/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,3223 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, %function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+
+ // preload A even
+ fldd d16, [r5, #0]
+ fldd d17, [r5, #8]
+ fldd d18, [r5, #16]
+ fldd d19, [r5, #24]
+
+ // preload B even
+ fldd d20, [r6, #0]
+ fldd d21, [r6, #8]
+ fldd d22, [r6, #16]
+ fldd d23, [r6, #24]
+
+ // preload A odd
+ fldd d24, [r5, #32]
+ fldd d25, [r5, #40]
+ fldd d26, [r5, #48]
+ fldd d27, [r5, #56]
+
+ // preload B odd
+ fldd d28, [r6, #32]
+ fldd d29, [r6, #40]
+ fldd d30, [r6, #48]
+ fldd d31, [r6, #56]
+
+ // prefetch
+ pld [r5, #64]
+ pld [r6, #64]
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // unroll 0
+ fmacd d0, d16, d20
+ pld [r5, #128] // prefetch
+ fmacd d1, d17, d20
+ pld [r6, #128] // prefetch
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fmacd d12, d16, d23
+ fldd d16, [r5, #64] // A
+ fmacd d13, d17, d23
+ fldd d17, [r5, #72] // A
+ fmacd d14, d18, d23
+ fldd d18, [r5, #80] // A
+ fmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fmacd d4, d24, d29
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fmacd d8, d24, d30
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fmacd d12, d24, d31
+ fldd d24, [r5, #96] // A
+ fmacd d13, d25, d31
+ fldd d25, [r5, #104] // A
+ fmacd d14, d26, d31
+ fldd d26, [r5, #112] // A
+ fmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+
+
+ // unroll 2
+ fmacd d0, d16, d20
+ pld [r6, #192] // prefetch
+ fmacd d1, d17, d20
+ add r6, r6, #128
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #0] // B
+
+ fmacd d4, d16, d21
+ pld [r5, #192] // prefetch
+ fmacd d5, d17, d21
+ add r5, r5, #128
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #8] // B
+
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+ fldd d22, [r6, #16] // B
+
+ fmacd d12, d16, d23
+ fldd d16, [r5, #0] // A
+ fmacd d13, d17, d23
+ fldd d17, [r5, #8] // A
+ fmacd d14, d18, d23
+ fldd d18, [r5, #16] // A
+ fmacd d15, d19, d23
+ fldd d19, [r5, #24] // A
+ fldd d23, [r6, #24] // B
+
+ // unroll 3
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #32] // B
+
+ fmacd d4, d24, d29
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #40] // B
+
+ fmacd d8, d24, d30
+ sub r4, r4, #4
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+ fldd d30, [r6, #48] // B
+
+ fmacd d12, d24, d31
+ fldd d24, [r5, #32] // A
+ fmacd d13, d25, d31
+ fldd d25, [r5, #40] // A
+ fmacd d14, d26, d31
+ fldd d26, [r5, #48] // A
+ fmacd d15, d27, d31
+ fldd d27, [r5, #56] // A
+ fldd d31, [r6, #56] // B
+
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fmacd d12, d16, d23
+ fldd d16, [r5, #64] // A
+ fmacd d13, d17, d23
+ fldd d17, [r5, #72] // A
+ fmacd d14, d18, d23
+ fldd d18, [r5, #80] // A
+ fmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fmacd d4, d24, d29
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fmacd d8, d24, d30
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fmacd d12, d24, d31
+ fldd d24, [r5, #96] // A
+ fmacd d13, d25, d31
+ fldd d25, [r5, #104] // A
+ fmacd d14, d26, d31
+ fldd d26, [r5, #112] // A
+ fmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ add r5, r5, #128
+ add r6, r6, #128
+
+ // unroll 2
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+
+ fmacd d12, d16, d23
+ fmacd d13, d17, d23
+ fmacd d14, d18, d23
+ fmacd d15, d19, d23
+
+ // unroll 3
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+
+ fmacd d4, d24, d29
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+
+ fmacd d8, d24, d30
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+
+ fmacd d12, d24, d31
+ fmacd d13, d25, d31
+ fmacd d14, d26, d31
+ fmacd d15, d27, d31
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ fldd d16, [r5, #0] // A
+ fldd d17, [r5, #8] // A
+ fldd d18, [r5, #16] // A
+ fldd d19, [r5, #24] // A
+
+ fldd d20, [r6, #0] // B
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+
+ fldd d21, [r6, #8] // B
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+
+ fldd d22, [r6, #16] // B
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+
+ fldd d23, [r6, #24] // B
+ fmacd d12, d16, d23
+ fmacd d13, d17, d23
+ fmacd d14, d18, d23
+ fmacd d15, d19, d23
+
+ add r5, r5, #32
+ add r6, r6, #32
+
+ sub r4, r4, #1
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+// r7 <- 4*sdb*sizeof(double)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x4_lib4, %function
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+ pld [r6, #64]
+
+ // preload A even
+ fldd d16, [r5, #0]
+ fldd d17, [r5, #8]
+ fldd d18, [r5, #16]
+ fldd d19, [r5, #24]
+
+ // preload B even
+ fldd d20, [r6, #0]
+ fldd d21, [r6, #32]
+ fldd d22, [r6, #64]
+ fldd d23, [r6, #96]
+
+ // preload A odd
+ fldd d24, [r5, #32]
+ fldd d25, [r5, #40]
+ fldd d26, [r5, #48]
+ fldd d27, [r5, #56]
+
+ // preload B odd
+ fldd d28, [r6, #8]
+ fldd d29, [r6, #40]
+ fldd d30, [r6, #72]
+ fldd d31, [r6, #104]
+
+ // prefetch
+ pld [r5, #64]
+
+ // B next
+ add r9, r7, r6
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // unroll 0
+ fmacd d0, d16, d20
+ pld [r5, #128] // prefetch
+ fmacd d1, d17, d20
+ pld [r9, #0]
+ fmacd d2, d18, d20
+ pld [r9, #64]
+ fmacd d3, d19, d20
+ fldd d20, [r6, #16] // B
+
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #48] // B
+
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fmacd d12, d16, d23
+ fldd d16, [r5, #64] // A
+ fmacd d13, d17, d23
+ fldd d17, [r5, #72] // A
+ fmacd d14, d18, d23
+ fldd d18, [r5, #80] // A
+ fmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #112] // B
+
+ // unroll 1
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #24] // B
+
+ fmacd d4, d24, d29
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #56] // B
+
+ fmacd d8, d24, d30
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+ fldd d30, [r6, #88] // B
+
+ fmacd d12, d24, d31
+ fldd d24, [r5, #96] // A
+ fmacd d13, d25, d31
+ fldd d25, [r5, #104] // A
+ fmacd d14, d26, d31
+ fldd d26, [r5, #112] // A
+ fmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ // unroll 2
+ fmacd d0, d16, d20
+ pld [r5, #192] // prefetch
+ fmacd d1, d17, d20
+ mov r6, r9
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #0] // B
+
+ fmacd d4, d16, d21
+ add r5, r5, #128
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #32] // B
+
+ fmacd d8, d16, d22
+ add r9, r9, r7
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+ fldd d22, [r6, #64] // B
+
+ fmacd d12, d16, d23
+ fldd d16, [r5, #0] // A
+ fmacd d13, d17, d23
+ fldd d17, [r5, #8] // A
+ fmacd d14, d18, d23
+ fldd d18, [r5, #16] // A
+ fmacd d15, d19, d23
+ fldd d19, [r5, #24] // A
+ fldd d23, [r6, #96] // B
+
+ // unroll 3
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #8] // B
+
+ fmacd d4, d24, d29
+ sub r4, r4, #4
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #40] // B
+
+ fmacd d8, d24, d30
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+ fldd d30, [r6, #72] // B
+
+ fmacd d12, d24, d31
+ fldd d24, [r5, #32] // A
+ fmacd d13, d25, d31
+ fldd d25, [r5, #40] // A
+ fmacd d14, d26, d31
+ fldd d26, [r5, #48] // A
+ fmacd d15, d27, d31
+ fldd d27, [r5, #56] // A
+ fldd d31, [r6, #104] // B
+
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #16] // B
+
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #48] // B
+
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fmacd d12, d16, d23
+ fldd d16, [r5, #64] // A
+ fmacd d13, d17, d23
+ fldd d17, [r5, #72] // A
+ fmacd d14, d18, d23
+ fldd d18, [r5, #80] // A
+ fmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #112] // B
+
+ // unroll 1
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #24] // B
+
+ fmacd d4, d24, d29
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #56] // B
+
+ fmacd d8, d24, d30
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+ fldd d30, [r6, #88] // B
+
+ fmacd d12, d24, d31
+ fldd d24, [r5, #96] // A
+ fmacd d13, d25, d31
+ fldd d25, [r5, #104] // A
+ fmacd d14, d26, d31
+ fldd d26, [r5, #112] // A
+ fmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ add r5, r5, #128
+ mov r6, r9
+
+ // unroll 2
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+
+ fmacd d12, d16, d23
+ fmacd d13, d17, d23
+ fmacd d14, d18, d23
+ fmacd d15, d19, d23
+
+ // unroll 3
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+
+ fmacd d4, d24, d29
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+
+ fmacd d8, d24, d30
+ fmacd d9, d25, d30
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+
+ fmacd d12, d24, d31
+ fmacd d13, d25, d31
+ fmacd d14, d26, d31
+ fmacd d15, d27, d31
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ fldd d16, [r5, #0] // A
+ fldd d17, [r5, #8] // A
+ fldd d18, [r5, #16] // A
+ fldd d19, [r5, #24] // A
+
+ fldd d20, [r6, #0] // B
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+
+ fldd d21, [r6, #32] // B
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+
+ fldd d22, [r6, #64] // B
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+
+ fldd d23, [r6, #96] // B
+ fmacd d12, d16, d23
+ fmacd d13, d17, d23
+ fmacd d14, d18, d23
+ fmacd d15, d19, d23
+
+ add r5, r5, #32
+ add r6, r6, #8
+
+ sub r4, r4, #1
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DSYRK_L_ADD_NT_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dsyrk_l_add_nt_4x4_lib4, %function
+inner_kernel_dsyrk_l_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dsyrk_l_add_nt_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+
+ // preload A even
+ fldd d16, [r5, #0]
+ fldd d17, [r5, #8]
+ fldd d18, [r5, #16]
+ fldd d19, [r5, #24]
+
+ // preload B even
+ fldd d20, [r6, #0]
+ fldd d21, [r6, #8]
+ fldd d22, [r6, #16]
+ fldd d23, [r6, #24]
+
+ // preload A odd
+ fldd d24, [r5, #32]
+ fldd d25, [r5, #40]
+ fldd d26, [r5, #48]
+ fldd d27, [r5, #56]
+
+ // preload B odd
+ fldd d28, [r6, #32]
+ fldd d29, [r6, #40]
+ fldd d30, [r6, #48]
+ fldd d31, [r6, #56]
+
+ // prefetch
+ pld [r5, #64]
+ pld [r6, #64]
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // prefetch
+ pld [r5, #128]
+ pld [r6, #128]
+
+ // unroll 0
+ fmacd d0, d16, d20
+ fldd d16, [r5, #64] // A
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fmacd d5, d17, d21
+ fldd d17, [r5, #72] // A
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fmacd d10, d18, d22
+ fldd d18, [r5, #80] // A
+ fmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fmacd d0, d24, d28
+ fldd d24, [r5, #96] // A
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fmacd d5, d25, d29
+ fldd d25, [r5, #104] // A
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fmacd d10, d26, d30
+ fldd d26, [r5, #112] // A
+ fmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ // prefetch
+ pld [r5, #192]
+ pld [r6, #192]
+
+ add r5, r5, #128
+ add r6, r6, #128
+
+ // unroll 2
+ fmacd d0, d16, d20
+ fldd d16, [r5, #0] // A
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #0] // B
+
+ fmacd d5, d17, d21
+ fldd d17, [r5, #8] // A
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #8] // B
+
+ fmacd d10, d18, d22
+ fldd d18, [r5, #16] // A
+ fmacd d11, d19, d22
+ fldd d22, [r6, #16] // B
+
+ fmacd d15, d19, d23
+ fldd d19, [r5, #24] // A
+ fldd d23, [r6, #24] // B
+
+ // unroll 3
+ fmacd d0, d24, d28
+ fldd d24, [r5, #32] // A
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #32] // B
+
+ fmacd d5, d25, d29
+ fldd d25, [r5, #40] // A
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #40] // B
+
+ fmacd d10, d26, d30
+ fldd d26, [r5, #48] // A
+ fmacd d11, d27, d30
+ fldd d30, [r6, #48] // B
+
+ fmacd d15, d27, d31
+ fldd d27, [r5, #56] // A
+ fldd d31, [r6, #56] // B
+
+ sub r4, r4, #4
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ fmacd d0, d16, d20
+ fldd d16, [r5, #64] // A
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fmacd d5, d17, d21
+ fldd d17, [r5, #72] // A
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fmacd d10, d18, d22
+ fldd d18, [r5, #80] // A
+ fmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fmacd d0, d24, d28
+ fldd d24, [r5, #96] // A
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fmacd d5, d25, d29
+ fldd d25, [r5, #104] // A
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fmacd d10, d26, d30
+ fldd d26, [r5, #112] // A
+ fmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ add r5, r5, #128
+ add r6, r6, #128
+
+ // unroll 2
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+
+ fmacd d15, d19, d23
+
+ // unroll 3
+ fmacd d0, d24, d28
+ fmacd d1, d25, d28
+ fmacd d2, d26, d28
+ fmacd d3, d27, d28
+
+ fmacd d5, d25, d29
+ fmacd d6, d26, d29
+ fmacd d7, d27, d29
+
+ fmacd d10, d26, d30
+ fmacd d11, d27, d30
+
+ fmacd d15, d27, d31
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ fldd d16, [r5, #0] // A
+ fldd d17, [r5, #8] // A
+ fldd d18, [r5, #16] // A
+ fldd d19, [r5, #24] // A
+
+ fldd d20, [r6, #0] // B
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+
+ fldd d21, [r6, #8] // B
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+
+ fldd d22, [r6, #16] // B
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+
+ fldd d23, [r6, #24] // B
+ fmacd d15, d19, d23
+
+ add r5, r5, #32
+ add r6, r6, #32
+
+ sub r4, r4, #1
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dsyrk_l_add_nt_4x4_lib4, .-inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_4x4_lib4, %function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+
+ // preload A even
+ fldd d16, [r5, #0]
+ fldd d17, [r5, #8]
+ fldd d18, [r5, #16]
+ fldd d19, [r5, #24]
+
+ // preload B even
+ fldd d20, [r6, #0]
+ fldd d21, [r6, #8]
+ fldd d22, [r6, #16]
+ fldd d23, [r6, #24]
+
+ // preload A odd
+ fldd d24, [r5, #32]
+ fldd d25, [r5, #40]
+ fldd d26, [r5, #48]
+ fldd d27, [r5, #56]
+
+ // preload B odd
+ fldd d28, [r6, #32]
+ fldd d29, [r6, #40]
+ fldd d30, [r6, #48]
+ fldd d31, [r6, #56]
+
+ // prefetch
+ pld [r5, #64]
+ pld [r6, #64]
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // prefetch
+ pld [r5, #128]
+ pld [r6, #128]
+
+ // unroll 0
+ fnmacd d0, d16, d20
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fnmacd d4, d16, d21
+ fnmacd d5, d17, d21
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fnmacd d8, d16, d22
+ fnmacd d9, d17, d22
+ fnmacd d10, d18, d22
+ fnmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fnmacd d12, d16, d23
+ fldd d16, [r5, #64] // A
+ fnmacd d13, d17, d23
+ fldd d17, [r5, #72] // A
+ fnmacd d14, d18, d23
+ fldd d18, [r5, #80] // A
+ fnmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fnmacd d0, d24, d28
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fnmacd d4, d24, d29
+ fnmacd d5, d25, d29
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fnmacd d8, d24, d30
+ fnmacd d9, d25, d30
+ fnmacd d10, d26, d30
+ fnmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fnmacd d12, d24, d31
+ fldd d24, [r5, #96] // A
+ fnmacd d13, d25, d31
+ fldd d25, [r5, #104] // A
+ fnmacd d14, d26, d31
+ fldd d26, [r5, #112] // A
+ fnmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ // prefetch
+ pld [r5, #192]
+ pld [r6, #192]
+
+ add r5, r5, #128
+ add r6, r6, #128
+
+ // unroll 2
+ fnmacd d0, d16, d20
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+ fldd d20, [r6, #0] // B
+
+ fnmacd d4, d16, d21
+ fnmacd d5, d17, d21
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+ fldd d21, [r6, #8] // B
+
+ fnmacd d8, d16, d22
+ fnmacd d9, d17, d22
+ fnmacd d10, d18, d22
+ fnmacd d11, d19, d22
+ fldd d22, [r6, #16] // B
+
+ fnmacd d12, d16, d23
+ fldd d16, [r5, #0] // A
+ fnmacd d13, d17, d23
+ fldd d17, [r5, #8] // A
+ fnmacd d14, d18, d23
+ fldd d18, [r5, #16] // A
+ fnmacd d15, d19, d23
+ fldd d19, [r5, #24] // A
+ fldd d23, [r6, #24] // B
+
+ // unroll 3
+ fnmacd d0, d24, d28
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+ fldd d28, [r6, #32] // B
+
+ fnmacd d4, d24, d29
+ fnmacd d5, d25, d29
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+ fldd d29, [r6, #40] // B
+
+ fnmacd d8, d24, d30
+ fnmacd d9, d25, d30
+ fnmacd d10, d26, d30
+ fnmacd d11, d27, d30
+ fldd d30, [r6, #48] // B
+
+ fnmacd d12, d24, d31
+ fldd d24, [r5, #32] // A
+ fnmacd d13, d25, d31
+ fldd d25, [r5, #40] // A
+ fnmacd d14, d26, d31
+ fldd d26, [r5, #48] // A
+ fnmacd d15, d27, d31
+ fldd d27, [r5, #56] // A
+ fldd d31, [r6, #56] // B
+
+ sub r4, r4, #4
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ fnmacd d0, d16, d20
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fnmacd d4, d16, d21
+ fnmacd d5, d17, d21
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fnmacd d8, d16, d22
+ fnmacd d9, d17, d22
+ fnmacd d10, d18, d22
+ fnmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fnmacd d12, d16, d23
+ fldd d16, [r5, #64] // A
+ fnmacd d13, d17, d23
+ fldd d17, [r5, #72] // A
+ fnmacd d14, d18, d23
+ fldd d18, [r5, #80] // A
+ fnmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fnmacd d0, d24, d28
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fnmacd d4, d24, d29
+ fnmacd d5, d25, d29
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fnmacd d8, d24, d30
+ fnmacd d9, d25, d30
+ fnmacd d10, d26, d30
+ fnmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fnmacd d12, d24, d31
+ fldd d24, [r5, #96] // A
+ fnmacd d13, d25, d31
+ fldd d25, [r5, #104] // A
+ fnmacd d14, d26, d31
+ fldd d26, [r5, #112] // A
+ fnmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ add r5, r5, #128
+ add r6, r6, #128
+
+ // unroll 2
+ fnmacd d0, d16, d20
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+
+ fnmacd d4, d16, d21
+ fnmacd d5, d17, d21
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+
+ fnmacd d8, d16, d22
+ fnmacd d9, d17, d22
+ fnmacd d10, d18, d22
+ fnmacd d11, d19, d22
+
+ fnmacd d12, d16, d23
+ fnmacd d13, d17, d23
+ fnmacd d14, d18, d23
+ fnmacd d15, d19, d23
+
+ // unroll 3
+ fnmacd d0, d24, d28
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+
+ fnmacd d4, d24, d29
+ fnmacd d5, d25, d29
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+
+ fnmacd d8, d24, d30
+ fnmacd d9, d25, d30
+ fnmacd d10, d26, d30
+ fnmacd d11, d27, d30
+
+ fnmacd d12, d24, d31
+ fnmacd d13, d25, d31
+ fnmacd d14, d26, d31
+ fnmacd d15, d27, d31
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ fldd d16, [r5, #0] // A
+ fldd d17, [r5, #8] // A
+ fldd d18, [r5, #16] // A
+ fldd d19, [r5, #24] // A
+
+ fldd d20, [r6, #0] // B
+ fnmacd d0, d16, d20
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+
+ fldd d21, [r6, #8] // B
+ fnmacd d4, d16, d21
+ fnmacd d5, d17, d21
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+
+ fldd d22, [r6, #16] // B
+ fnmacd d8, d16, d22
+ fnmacd d9, d17, d22
+ fnmacd d10, d18, d22
+ fnmacd d11, d19, d22
+
+ fldd d23, [r6, #24] // B
+ fnmacd d12, d16, d23
+ fnmacd d13, d17, d23
+ fnmacd d14, d18, d23
+ fnmacd d15, d19, d23
+
+ add r5, r5, #32
+ add r6, r6, #32
+
+ sub r4, r4, #1
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DSYRK_L_SUB_NT_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dsyrk_l_sub_nt_4x4_lib4, %function
+inner_kernel_dsyrk_l_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dsyrk_l_sub_nt_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+
+ // preload A even
+ fldd d16, [r5, #0]
+ fldd d17, [r5, #8]
+ fldd d18, [r5, #16]
+ fldd d19, [r5, #24]
+
+ // preload B even
+ fldd d20, [r6, #0]
+ fldd d21, [r6, #8]
+ fldd d22, [r6, #16]
+ fldd d23, [r6, #24]
+
+ // preload A odd
+ fldd d24, [r5, #32]
+ fldd d25, [r5, #40]
+ fldd d26, [r5, #48]
+ fldd d27, [r5, #56]
+
+ // preload B odd
+ fldd d28, [r6, #32]
+ fldd d29, [r6, #40]
+ fldd d30, [r6, #48]
+ fldd d31, [r6, #56]
+
+ // prefetch
+ pld [r5, #64]
+ pld [r6, #64]
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // prefetch
+ pld [r5, #128]
+ pld [r6, #128]
+
+ // unroll 0
+ fnmacd d0, d16, d20
+ fldd d16, [r5, #64] // A
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fnmacd d5, d17, d21
+ fldd d17, [r5, #72] // A
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fnmacd d10, d18, d22
+ fldd d18, [r5, #80] // A
+ fnmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fnmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fnmacd d0, d24, d28
+ fldd d24, [r5, #96] // A
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fnmacd d5, d25, d29
+ fldd d25, [r5, #104] // A
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fnmacd d10, d26, d30
+ fldd d26, [r5, #112] // A
+ fnmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fnmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ // prefetch
+ pld [r5, #192]
+ pld [r6, #192]
+
+ add r5, r5, #128
+ add r6, r6, #128
+
+ // unroll 2
+ fnmacd d0, d16, d20
+ fldd d16, [r5, #0] // A
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+ fldd d20, [r6, #0] // B
+
+ fnmacd d5, d17, d21
+ fldd d17, [r5, #8] // A
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+ fldd d21, [r6, #8] // B
+
+ fnmacd d10, d18, d22
+ fldd d18, [r5, #16] // A
+ fnmacd d11, d19, d22
+ fldd d22, [r6, #16] // B
+
+ fnmacd d15, d19, d23
+ fldd d19, [r5, #24] // A
+ fldd d23, [r6, #24] // B
+
+ // unroll 3
+ fnmacd d0, d24, d28
+ fldd d24, [r5, #32] // A
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+ fldd d28, [r6, #32] // B
+
+ fnmacd d5, d25, d29
+ fldd d25, [r5, #40] // A
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+ fldd d29, [r6, #40] // B
+
+ fnmacd d10, d26, d30
+ fldd d26, [r5, #48] // A
+ fnmacd d11, d27, d30
+ fldd d30, [r6, #48] // B
+
+ fnmacd d15, d27, d31
+ fldd d27, [r5, #56] // A
+ fldd d31, [r6, #56] // B
+
+ sub r4, r4, #4
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ fnmacd d0, d16, d20
+ fldd d16, [r5, #64] // A
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+ fldd d20, [r6, #64] // B
+
+ fnmacd d5, d17, d21
+ fldd d17, [r5, #72] // A
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+ fldd d21, [r6, #72] // B
+
+ fnmacd d10, d18, d22
+ fldd d18, [r5, #80] // A
+ fnmacd d11, d19, d22
+ fldd d22, [r6, #80] // B
+
+ fnmacd d15, d19, d23
+ fldd d19, [r5, #88] // A
+ fldd d23, [r6, #88] // B
+
+ // unroll 1
+ fnmacd d0, d24, d28
+ fldd d24, [r5, #96] // A
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+ fldd d28, [r6, #96] // B
+
+ fnmacd d5, d25, d29
+ fldd d25, [r5, #104] // A
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+ fldd d29, [r6, #104] // B
+
+ fnmacd d10, d26, d30
+ fldd d26, [r5, #112] // A
+ fnmacd d11, d27, d30
+ fldd d30, [r6, #112] // B
+
+ fnmacd d15, d27, d31
+ fldd d27, [r5, #120] // A
+ fldd d31, [r6, #120] // B
+
+ add r5, r5, #128
+ add r6, r6, #128
+
+ // unroll 2
+ fnmacd d0, d16, d20
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+
+ fnmacd d5, d17, d21
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+
+ fnmacd d10, d18, d22
+ fnmacd d11, d19, d22
+
+ fnmacd d15, d19, d23
+
+ // unroll 3
+ fnmacd d0, d24, d28
+ fnmacd d1, d25, d28
+ fnmacd d2, d26, d28
+ fnmacd d3, d27, d28
+
+ fnmacd d5, d25, d29
+ fnmacd d6, d26, d29
+ fnmacd d7, d27, d29
+
+ fnmacd d10, d26, d30
+ fnmacd d11, d27, d30
+
+ fnmacd d15, d27, d31
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ fldd d16, [r5, #0] // A
+ fldd d17, [r5, #8] // A
+ fldd d18, [r5, #16] // A
+ fldd d19, [r5, #24] // A
+
+ fldd d20, [r6, #0] // B
+ fnmacd d0, d16, d20
+ fnmacd d1, d17, d20
+ fnmacd d2, d18, d20
+ fnmacd d3, d19, d20
+
+ fldd d21, [r6, #8] // B
+ fnmacd d5, d17, d21
+ fnmacd d6, d18, d21
+ fnmacd d7, d19, d21
+
+ fldd d22, [r6, #16] // B
+ fnmacd d10, d18, d22
+ fnmacd d11, d19, d22
+
+ fldd d23, [r6, #24] // B
+ fnmacd d15, d19, d23
+
+ add r5, r5, #32
+ add r6, r6, #32
+
+ sub r4, r4, #1
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dsyrk_l_sub_nt_4x4_lib4, .-inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+// r7 <- bs*sdb*sizeof(double)
+// r8 <- offsetB
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x4_lib4, %function
+inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmp r8, #0
+ ble 2f // return
+
+ cmp r4, #0
+ ble 2f // return
+
+ rsb r9, r8, #4 // 4-offsetB
+ cmp r9, r4
+// ble 0f
+//	mov r9, r4 // kend=min(k,4-offsetB)
+//0:
+	movgt r9, r4 // kend=min(k,4-offsetB)
+
+// lsl r10, r8, #3 // offsetB*sizeof(double)
+ add r6, r6, r8, LSL #3 // B + offsetB*sizeof(double)
+
+1:
+ fldd d16, [r5, #0] // A
+ fldd d17, [r5, #8] // A
+ fldd d18, [r5, #16] // A
+ fldd d19, [r5, #24] // A
+
+ fldd d20, [r6, #0] // B
+ fmacd d0, d16, d20
+ fmacd d1, d17, d20
+ fmacd d2, d18, d20
+ fmacd d3, d19, d20
+
+ fldd d21, [r6, #32] // B
+ fmacd d4, d16, d21
+ fmacd d5, d17, d21
+ fmacd d6, d18, d21
+ fmacd d7, d19, d21
+
+ fldd d22, [r6, #64] // B
+ fmacd d8, d16, d22
+ fmacd d9, d17, d22
+ fmacd d10, d18, d22
+ fmacd d11, d19, d22
+
+ fldd d23, [r6, #96] // B
+ fmacd d12, d16, d23
+ fmacd d13, d17, d23
+ fmacd d14, d18, d23
+ fmacd d15, d19, d23
+
+ sub r4, r4, #1
+ sub r9, r9, #1
+ add r5, r5, #32
+ add r6, r6, #8
+
+ cmp r9, #0
+ bgt 1b
+
+ cmp r4, #0
+ ble 2f // return
+
+ add r6, r6, r7
+ sub r6, r6, #32
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r4 <- E
+// r5 <- inv_diag_E
+//
+// output arguments:
+// r4 <- E
+// r5 <- inv_diag_E
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_lib4, %function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+
+ // first column
+ fldd d16, [r5, #0] // E_inv[0]
+ fmuld d0, d0, d16
+ fmuld d1, d1, d16
+ fmuld d2, d2, d16
+ fmuld d3, d3, d16
+
+ // second column
+ fldd d16, [r4, #8] // E[1+4*0]
+ fnmacd d4, d0, d16
+ fnmacd d5, d1, d16
+ fnmacd d6, d2, d16
+ fnmacd d7, d3, d16
+ fldd d16, [r5, #8] // E_inv[1]
+ fmuld d4, d4, d16
+ fmuld d5, d5, d16
+ fmuld d6, d6, d16
+ fmuld d7, d7, d16
+
+ // third column
+ fldd d16, [r4, #16] // E[2+4*0]
+ fnmacd d8, d0, d16
+ fnmacd d9, d1, d16
+ fnmacd d10, d2, d16
+ fnmacd d11, d3, d16
+ fldd d16, [r4, #48] // E[2+4*1]
+ fnmacd d8, d4, d16
+ fnmacd d9, d5, d16
+ fnmacd d10, d6, d16
+ fnmacd d11, d7, d16
+ fldd d16, [r5, #16] // E_inv[2]
+ fmuld d8, d8, d16
+ fmuld d9, d9, d16
+ fmuld d10, d10, d16
+ fmuld d11, d11, d16
+
+ // fourth column
+ fldd d16, [r4, #24] // E[3+4*0]
+ fnmacd d12, d0, d16
+ fnmacd d13, d1, d16
+ fnmacd d14, d2, d16
+ fnmacd d15, d3, d16
+ fldd d16, [r4, #56] // E[3+4*1]
+ fnmacd d12, d4, d16
+ fnmacd d13, d5, d16
+ fnmacd d14, d6, d16
+ fnmacd d15, d7, d16
+ fldd d16, [r4, #88] // E[3+4*2]
+ fnmacd d12, d8, d16
+ fnmacd d13, d9, d16
+ fnmacd d14, d10, d16
+ fnmacd d15, d11, d16
+ fldd d16, [r5, #24] // E_inv[3]
+ fmuld d12, d12, d16
+ fmuld d13, d13, d16
+ fmuld d14, d14, d16
+ fmuld d15, d15, d16
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// cholesky factorization
+//
+// input arguments:
+// r4 <- inv_diag_D
+//
+// output arguments:
+// r4 <- inv_diag_D
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_4x4_lib4, %function
+inner_edge_dpotrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_lib4:
+#endif
+#endif
+
+ fconstd d16, #112 // 1.0
+ fldd d17, .LC01 // 0.0
+
+ // first column
+ fcmped d0, d17
+ fmstat
+ ble 1f
+ fsqrtd d0, d0
+ fdivd d18, d16, d0
+ fstd d18, [r4, #0]
+2:
+ fmuld d1, d1, d18
+ fmuld d2, d2, d18
+ fmuld d3, d3, d18
+
+ // second column
+ fnmacd d5, d1, d1
+ fnmacd d6, d1, d2
+ fnmacd d7, d1, d3
+ fcmped d5, d17
+ fmstat
+ ble 3f
+ fsqrtd d5, d5
+ fdivd d18, d16, d5
+ fstd d18, [r4, #8]
+4:
+ fmuld d6, d6, d18
+ fmuld d7, d7, d18
+
+ // third column
+ fnmacd d10, d2, d2
+ fnmacd d11, d2, d3
+ fnmacd d10, d6, d6
+ fnmacd d11, d6, d7
+ fcmped d10, d17
+ fmstat
+ ble 5f
+ fsqrtd d10, d10
+ fdivd d18, d16, d10
+ fstd d18, [r4, #16]
+6:
+ fmuld d11, d11, d18
+
+ // fourth column
+ fnmacd d15, d3, d3
+ fnmacd d15, d7, d7
+ fnmacd d15, d11, d11
+ fcmped d15, d17
+ fmstat
+ ble 7f
+ fsqrtd d15, d15
+ fdivd d18, d16, d15
+ fstd d18, [r4, #24]
+
+ b 0f
+
+1:
+ fldd d0, .LC01
+ b 2b
+
+3:
+ fldd d5, .LC01
+ b 4b
+
+5:
+ fldd d10, .LC01
+ b 6b
+
+7:
+ fldd d15, .LC01
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_4x4_lib4, .-inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+ .align 3
+.LC01: // { 0 }
+ .word 0
+ .word 0
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- alpha
+// r5 <- beta
+// r6 <- C
+//
+// output arguments:
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_lib4, %function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ fldd d16, [r4, #0] // alpha
+
+ fmuld d0, d0, d16
+ fldd d18, [r5, #0] // beta
+ fmuld d1, d1, d16
+ fldd d17, .LC01 // 0.0
+ fmuld d2, d2, d16
+ fmuld d3, d3, d16
+
+ fmuld d4, d4, d16
+ fmuld d5, d5, d16
+ fmuld d6, d6, d16
+ fmuld d7, d7, d16
+
+ fmuld d8, d8, d16
+ fcmped d18, d17
+ fmuld d9, d9, d16
+ fmuld d10, d10, d16
+ fmuld d11, d11, d16
+
+ fmuld d12, d12, d16
+ fmstat
+ fmuld d13, d13, d16
+ fmuld d14, d14, d16
+ fmuld d15, d15, d16
+
+ beq 0f // end
+
+ fldd d17, [r6, #0] // C
+ fmacd d0, d18, d17
+ fldd d17, [r6, #8] // C
+ fmacd d1, d18, d17
+ fldd d17, [r6, #16] // C
+ fmacd d2, d18, d17
+ fldd d17, [r6, #24] // C
+ fmacd d3, d18, d17
+
+ fldd d17, [r6, #32] // C
+ fmacd d4, d18, d17
+ fldd d17, [r6, #40] // C
+ fmacd d5, d18, d17
+ fldd d17, [r6, #48] // C
+ fmacd d6, d18, d17
+ fldd d17, [r6, #56] // C
+ fmacd d7, d18, d17
+
+ fldd d17, [r6, #64] // C
+ fmacd d8, d18, d17
+ fldd d17, [r6, #72] // C
+ fmacd d9, d18, d17
+ fldd d17, [r6, #80] // C
+ fmacd d10, d18, d17
+ fldd d17, [r6, #88] // C
+ fmacd d11, d18, d17
+
+ fldd d17, [r6, #96] // C
+ fmacd d12, d18, d17
+ fldd d17, [r6, #104] // C
+ fmacd d13, d18, d17
+ fldd d17, [r6, #112] // C
+ fmacd d14, d18, d17
+ fldd d17, [r6, #120] // C
+ fmacd d15, d18, d17
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- C
+//
+// output arguments:
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_4x4_lib4, %function
+inner_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_4x4_lib4:
+#endif
+#endif
+
+ fldd d17, [r4, #0] // C
+ faddd d0, d0, d17
+ fldd d17, [r4, #8] // C
+ faddd d1, d1, d17
+ fldd d17, [r4, #16] // C
+ faddd d2, d2, d17
+ fldd d17, [r4, #24] // C
+ faddd d3, d3, d17
+
+ fldd d17, [r4, #32] // C
+ faddd d4, d4, d17
+ fldd d17, [r4, #40] // C
+ faddd d5, d5, d17
+ fldd d17, [r4, #48] // C
+ faddd d6, d6, d17
+ fldd d17, [r4, #56] // C
+ faddd d7, d7, d17
+
+ fldd d17, [r4, #64] // C
+ faddd d8, d8, d17
+ fldd d17, [r4, #72] // C
+ faddd d9, d9, d17
+ fldd d17, [r4, #80] // C
+ faddd d10, d10, d17
+ fldd d17, [r4, #88] // C
+ faddd d11, d11, d17
+
+ fldd d17, [r4, #96] // C
+ faddd d12, d12, d17
+ fldd d17, [r4, #104] // C
+ faddd d13, d13, d17
+ fldd d17, [r4, #112] // C
+ faddd d14, d14, d17
+ fldd d17, [r4, #120] // C
+ faddd d15, d15, d17
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_4x4_lib4, .-inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- D
+//
+// output arguments:
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, %function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#endif
+#endif
+
+ fstd d0, [r4, #0]
+ fstd d1, [r4, #8]
+ fstd d2, [r4, #16]
+ fstd d3, [r4, #24]
+
+ fstd d4, [r4, #32]
+ fstd d5, [r4, #40]
+ fstd d6, [r4, #48]
+ fstd d7, [r4, #56]
+
+ fstd d8, [r4, #64]
+ fstd d9, [r4, #72]
+ fstd d10, [r4, #80]
+ fstd d11, [r4, #88]
+
+ fstd d12, [r4, #96]
+ fstd d13, [r4, #104]
+ fstd d14, [r4, #112]
+ fstd d15, [r4, #120]
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- D
+//
+// output arguments:
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_lib4, %function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#endif
+#endif
+
+ fstd d0, [r4, #0]
+ fstd d1, [r4, #8]
+ fstd d2, [r4, #16]
+ fstd d3, [r4, #24]
+
+// fstd d4, [r4, #32]
+ fstd d5, [r4, #40]
+ fstd d6, [r4, #48]
+ fstd d7, [r4, #56]
+
+// fstd d8, [r4, #64]
+// fstd d9, [r4, #72]
+ fstd d10, [r4, #80]
+ fstd d11, [r4, #88]
+
+// fstd d12, [r4, #96]
+// fstd d13, [r4, #104]
+// fstd d14, [r4, #112]
+ fstd d15, [r4, #120]
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// zero double word
+ .align 3
+.LC00: // { 0 }
+ .word 0
+ .word 0
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8
+// void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+
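+// Reference sketch (kept under #if 0 so the assembler never sees it): a plain-C
+// version of what this kernel is assumed to compute, namely
+// D = alpha*A*B^T + beta*C on a 4x4 block, with A and B stored as 4 x kmax
+// panels in the lib4 panel-major layout (each 4-element column contiguous).
+// The helper name ref_kernel_dgemm_nt_4x4_lib4 is illustrative only.
+#if 0
+static void ref_kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	int i, j, l;
+	for(j=0; j<4; j++)
+		{
+		for(i=0; i<4; i++)
+			{
+			double d = 0.0;
+			for(l=0; l<kmax; l++)
+				d += A[i+4*l] * B[j+4*l]; // column l of A times column l of B
+			D[i+4*j] = alpha[0]*d + beta[0]*C[i+4*j];
+			}
+		}
+	}
+#endif
+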
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .global kernel_dgemm_nt_4x4_lib4
+ .type kernel_dgemm_nt_4x4_lib4, %function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+	.global _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ fldd d0, .LC00
+ fcpyd d1, d0
+ fcpyd d2, d0
+ fcpyd d3, d0
+ fcpyd d4, d0
+ fcpyd d5, d0
+ fcpyd d6, d0
+ fcpyd d7, d0
+ fcpyd d8, d0
+ fcpyd d9, d0
+ fcpyd d10, d0
+ fcpyd d11, d0
+ fcpyd d12, d0
+ fcpyd d13, d0
+ fcpyd d14, d0
+ fcpyd d15, d0
+
+
+
+ // call inner kernel dgemm nt
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ mov r6, r3 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #0] // beta
+ ldr r6, [fp, #4] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #8] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
+
+
+
+
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8 sp+12 sp+16
+// void kernel_dgemm_nn_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D)
+
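+// Reference sketch (kept under #if 0, never assembled): assumed plain-C
+// semantics of the nn kernel, D = alpha*A*B + beta*C, where B is read
+// column-wise from 4-row panels with panel stride sdb and the first row of
+// interest sits at row offsetB of the first panel. Illustrative helper name.
+#if 0
+static void ref_kernel_dgemm_nn_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D)
+	{
+	int i, j, l, ll;
+	for(j=0; j<4; j++)
+		{
+		for(i=0; i<4; i++)
+			{
+			double d = 0.0;
+			for(l=0; l<kmax; l++)
+				{
+				ll = offsetB + l; // row of B, counted from the top of its first panel
+				d += A[i+4*l] * B[(ll/4)*4*sdb + ll%4 + 4*j];
+				}
+			D[i+4*j] = alpha[0]*d + beta[0]*C[i+4*j];
+			}
+		}
+	}
+#endif
+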
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .global kernel_dgemm_nn_4x4_lib4
+ .type kernel_dgemm_nn_4x4_lib4, %function
+kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+	.global _kernel_dgemm_nn_4x4_lib4
+_kernel_dgemm_nn_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ fldd d0, .LC00
+ fcpyd d1, d0
+ fcpyd d2, d0
+ fcpyd d3, d0
+ fcpyd d4, d0
+ fcpyd d5, d0
+ fcpyd d6, d0
+ fcpyd d7, d0
+ fcpyd d8, d0
+ fcpyd d9, d0
+ fcpyd d10, d0
+ fcpyd d11, d0
+ fcpyd d12, d0
+ fcpyd d13, d0
+ fcpyd d14, d0
+ fcpyd d15, d0
+
+
+
+	// call inner kernel dgemm nn
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ ldr r6, [fp, #0] // B
+ ldr r7, [fp, #4] // sdb
+ lsl r7, r7, #5 // 4*sizeof(double)*sdb
+ mov r8, r3 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #8] // beta
+ ldr r6, [fp, #12] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #16] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
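+// Reference sketch (kept under #if 0, never assembled): the syrk variant is
+// assumed to update only the lower triangle of the 4x4 block, i.e.
+// D(i,j) = alpha*(A*B^T)(i,j) + beta*C(i,j) for i>=j, leaving the strictly
+// upper part of D untouched (see inner_store_l_4x4_lib4 above). Illustrative
+// helper name.
+#if 0
+static void ref_kernel_dsyrk_nt_l_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+	{
+	int i, j, l;
+	for(j=0; j<4; j++)
+		{
+		for(i=j; i<4; i++) // lower triangle only
+			{
+			double d = 0.0;
+			for(l=0; l<kmax; l++)
+				d += A[i+4*l] * B[j+4*l];
+			D[i+4*j] = alpha[0]*d + beta[0]*C[i+4*j];
+			}
+		}
+	}
+#endif
+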
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .type kernel_dsyrk_nt_l_4x4_lib4, %function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ fldd d0, .LC00
+ fcpyd d1, d0
+ fcpyd d2, d0
+ fcpyd d3, d0
+ fcpyd d4, d0
+ fcpyd d5, d0
+ fcpyd d6, d0
+ fcpyd d7, d0
+ fcpyd d8, d0
+ fcpyd d9, d0
+ fcpyd d10, d0
+ fcpyd d11, d0
+ fcpyd d12, d0
+ fcpyd d13, d0
+ fcpyd d14, d0
+ fcpyd d15, d0
+
+
+
+ // call inner kernel dsyrk l nt
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ mov r6, r3 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DSYRK_L_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #0] // beta
+ ldr r6, [fp, #4] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+ // store l
+ ldr r4, [fp, #8] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+
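+// Reference sketch (kept under #if 0, never assembled): assumed semantics, a
+// right/lower/transposed triangular solve with precomputed reciprocals of the
+// diagonal: D = (C - A*B^T) * E^{-T}, with E a 4x4 lower triangular block and
+// inv_diag_E[j] = 1.0/E[j+4*j]. Illustrative helper name.
+#if 0
+static void ref_kernel_dtrsm_nt_rl_inv_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
+	{
+	int i, j, k, l;
+	double tmp[16];
+	for(j=0; j<4; j++)
+		for(i=0; i<4; i++)
+			{
+			double d = C[i+4*j];
+			for(l=0; l<kmax; l++)
+				d -= A[i+4*l] * B[j+4*l];
+			tmp[i+4*j] = d;
+			}
+	for(j=0; j<4; j++) // solve D * E^T = tmp, one column of D at a time
+		for(i=0; i<4; i++)
+			{
+			double d = tmp[i+4*j];
+			for(k=0; k<j; k++)
+				d -= D[i+4*k] * E[j+4*k];
+			D[i+4*j] = d * inv_diag_E[j];
+			}
+	}
+#endif
+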
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_lib4, %function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ fldd d0, .LC00
+ fcpyd d1, d0
+ fcpyd d2, d0
+ fcpyd d3, d0
+ fcpyd d4, d0
+ fcpyd d5, d0
+ fcpyd d6, d0
+ fcpyd d7, d0
+ fcpyd d8, d0
+ fcpyd d9, d0
+ fcpyd d10, d0
+ fcpyd d11, d0
+ fcpyd d12, d0
+ fcpyd d13, d0
+ fcpyd d14, d0
+ fcpyd d15, d0
+
+
+
+	// call inner kernel dgemm nt sub
+ mov r4, r0 // kmax
+ mov r5, r1 // A
+ mov r6, r2 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for alpha=1.0 and beta=1.0
+ mov r4, r3 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+ // factorization
+ ldr r4, [fp, #4] // E
+ ldr r5, [fp, #8] // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+	// store n
+ ldr r4, [fp, #0] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+// r0 r1 r2 r3 sp+0 sp+4
+// void kernel_dpotrf_nt_l_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D);
+
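+// Reference sketch (kept under #if 0, never assembled): assumed semantics, a
+// lower Cholesky factorization of the 4x4 block M = C - A*B^T, storing the
+// factor in the lower triangle of D and the reciprocals of its diagonal in
+// inv_diag_D; non-positive pivots are not handled here. Illustrative helper name.
+#if 0
+#include <math.h>
+static void ref_kernel_dpotrf_nt_l_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D)
+	{
+	int i, j, k, l;
+	for(j=0; j<4; j++)
+		{
+		for(i=j; i<4; i++) // lower triangle of C - A*B^T, updated with previous columns of D
+			{
+			double d = C[i+4*j];
+			for(l=0; l<kmax; l++)
+				d -= A[i+4*l] * B[j+4*l];
+			for(k=0; k<j; k++)
+				d -= D[i+4*k] * D[j+4*k];
+			D[i+4*j] = d;
+			}
+		double djj = sqrt(D[j+4*j]); // pivot assumed positive
+		D[j+4*j] = djj;
+		inv_diag_D[j] = 1.0/djj;
+		for(i=j+1; i<4; i++)
+			D[i+4*j] *= inv_diag_D[j];
+		}
+	}
+#endif
+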
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .type kernel_dpotrf_nt_l_4x4_lib4, %function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ fldd d0, .LC00
+ fcpyd d1, d0
+ fcpyd d2, d0
+ fcpyd d3, d0
+ fcpyd d4, d0
+ fcpyd d5, d0
+ fcpyd d6, d0
+ fcpyd d7, d0
+ fcpyd d8, d0
+ fcpyd d9, d0
+ fcpyd d10, d0
+ fcpyd d11, d0
+ fcpyd d12, d0
+ fcpyd d13, d0
+ fcpyd d14, d0
+ fcpyd d15, d0
+
+
+
+	// call inner kernel dsyrk l nt sub
+ mov r4, r0 // kmax
+ mov r5, r1 // A
+ mov r6, r2 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DSYRK_L_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for alpha=1.0 and beta=1.0
+ mov r4, r3 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+ // factorization
+ ldr r4, [fp, #4] // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+
+ // store l
+ ldr r4, [fp, #0] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8 sp+12 sp+16 sp+20
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
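+//
+// fused kernel: accumulates Ap*Bp^T - Am*Bm^T on top of C, then solves the
+// resulting 4x4 block from the right against the transposed lower triangular
+// factor E (using the precomputed reciprocal diagonal inv_diag_E) and stores
+// the solution in D.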
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, %function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ fldd d0, .LC00
+ fcpyd d1, d0
+ fcpyd d2, d0
+ fcpyd d3, d0
+ fcpyd d4, d0
+ fcpyd d5, d0
+ fcpyd d6, d0
+ fcpyd d7, d0
+ fcpyd d8, d0
+ fcpyd d9, d0
+ fcpyd d10, d0
+ fcpyd d11, d0
+ fcpyd d12, d0
+ fcpyd d13, d0
+ fcpyd d14, d0
+ fcpyd d15, d0
+
+
+
+	// call inner kernel dgemm nt add
+ mov r4, r0 // kp
+ mov r5, r1 // Ap
+ mov r6, r2 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+	// call inner kernel dgemm nt sub
+	mov		r4, r3 // km
+ ldr r5, [fp, #0] // Am
+ ldr r6, [fp, #4] // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for alpha=1.0 and beta=1.0
+ ldr r4, [fp, #8] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+ // factorization
+ ldr r4, [fp, #16] // E
+ ldr r5, [fp, #20] // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+	// store n
+ ldr r4, [fp, #12] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8 sp+12 sp+16
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
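+//
+// fused kernel: accumulates the lower part of Ap*Bp^T - Am*Bm^T on top of C,
+// then factorizes the result with the 4x4 Cholesky edge and stores the lower
+// factor in D; inv_diag_D receives the reciprocals of the diagonal of D.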
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, %function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+	.globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ fldd d0, .LC00
+ fcpyd d1, d0
+ fcpyd d2, d0
+ fcpyd d3, d0
+ fcpyd d4, d0
+ fcpyd d5, d0
+ fcpyd d6, d0
+ fcpyd d7, d0
+ fcpyd d8, d0
+ fcpyd d9, d0
+ fcpyd d10, d0
+ fcpyd d11, d0
+ fcpyd d12, d0
+ fcpyd d13, d0
+ fcpyd d14, d0
+ fcpyd d15, d0
+
+
+
+ // call inner kernel dsyrk l nt
+ mov r4, r0 // kp
+ mov r5, r1 // Ap
+ mov r6, r2 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DSYRK_L_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dsyrk_l_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner kernel dsyrk l nt sub
+	mov		r4, r3 // km
+ ldr r5, [fp, #0] // Am
+ ldr r6, [fp, #4] // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DSYRK_L_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_dsyrk_l_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for alpha=1.0 and beta=1.0
+ ldr r4, [fp, #8] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+ // factorization
+ ldr r4, [fp, #16] // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+
+ // store l
+ ldr r4, [fp, #12] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
diff --git a/kernel/armv7a/kernel_sgemm_12x4_lib4.S b/kernel/armv7a/kernel_sgemm_12x4_lib4.S
new file mode 100644
index 0000000..96ff7a4
--- /dev/null
+++ b/kernel/armv7a/kernel_sgemm_12x4_lib4.S
@@ -0,0 +1,589 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- 4*sda*sizeof(float)
+// r7 <- B
+//
+// output arguments:
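+//
+// note: r6 is the byte distance between 4-row panels of A, so A1 = A + r6 and
+// A2 = A + 2*r6; the 12x4 accumulator lives in q4-q15 and the main loop is
+// unrolled 4 times, with vldr loads and pld prefetches interleaved between the
+// vmla instructions.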
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_12X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_12x4_lib4, %function
+inner_kernel_gemm_add_nt_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_12x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ add r8, r5, r6 // A1
+ add r9, r8, r6 // A2
+
+ // prefetch
+ pld [r5, #0] // A0
+ pld [r7, #0] // B
+ pld [r8, #0] // A1
+ pld [r9, #0] // A2
+
+ // preload
+ vld1.64 {d0, d1}, [r7:128] // B
+ vld1.64 {d2, d3}, [r5:128] // A0
+ vld1.64 {d4, d5}, [r8:128] // A1
+// vld1.64 {d6, d7}, [r9:128] // A2
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ pld [r5, #64] // A0
+ pld [r7, #64] // B
+ pld [r8, #64] // A1
+ pld [r9, #64] // A2
+
+ // main loop
+1:
+
+ // unroll 0
+ vmla.f32 q4, q1, d0[0]
+ vldr d6, [r9, #0] // A2
+ vmla.f32 q5, q1, d0[1]
+ vldr d7, [r9, #8] // A2
+ vmla.f32 q6, q1, d1[0]
+ pld [r7, #128]
+ vmla.f32 q7, q1, d1[1]
+ vldr d2, [r5, #16] // A0
+ vmla.f32 q8, q2, d0[0]
+ vldr d3, [r5, #24] // A0
+ vmla.f32 q9, q2, d0[1]
+ pld [r5, #128]
+ vmla.f32 q10, q2, d1[0]
+ pld [r8, #128]
+ vmla.f32 q11, q2, d1[1]
+ vldr d4, [r7, #16] // B
+ vmla.f32 q12, q3, d0[0]
+ vldr d5, [r7, #24] // B
+ vmla.f32 q13, q3, d0[1]
+ vldr d0, [r8, #16] // A1
+ vmla.f32 q14, q3, d1[0]
+ pld [r9, #128]
+ vmla.f32 q15, q3, d1[1]
+ vldr d1, [r8, #24] // A1
+
+ // unroll 1
+ vmla.f32 q4, q1, d4[0]
+ vldr d6, [r9, #16] // A2
+ vmla.f32 q5, q1, d4[1]
+ vldr d7, [r9, #24] // A2
+ vmla.f32 q6, q1, d5[0]
+ sub r4, r4, #4
+ vmla.f32 q7, q1, d5[1]
+ vldr d2, [r5, #32] // A0
+ vmla.f32 q8, q0, d4[0]
+ vldr d3, [r5, #40] // A0
+ vmla.f32 q9, q0, d4[1]
+ vmla.f32 q10, q0, d5[0]
+ vmla.f32 q11, q0, d5[1]
+ vldr d0, [r7, #32] // B
+ vmla.f32 q12, q3, d4[0]
+ vldr d1, [r7, #40] // B
+ vmla.f32 q13, q3, d4[1]
+ vldr d4, [r8, #32] // A1
+ vmla.f32 q14, q3, d5[0]
+ vmla.f32 q15, q3, d5[1]
+ vldr d5, [r8, #40] // A1
+
+ // unroll 2
+ vmla.f32 q4, q1, d0[0]
+ vldr d6, [r9, #32] // A2
+ vmla.f32 q5, q1, d0[1]
+ vldr d7, [r9, #40] // A2
+ vmla.f32 q6, q1, d1[0]
+ vmla.f32 q7, q1, d1[1]
+ vldr d2, [r5, #48] // A0
+ vmla.f32 q8, q2, d0[0]
+ vldr d3, [r5, #56] // A0
+ vmla.f32 q9, q2, d0[1]
+ vmla.f32 q10, q2, d1[0]
+ add r5, r5, #64
+ vmla.f32 q11, q2, d1[1]
+ vldr d4, [r7, #48] // B
+ vmla.f32 q12, q3, d0[0]
+ vldr d5, [r7, #56] // B
+ vmla.f32 q13, q3, d0[1]
+ vldr d0, [r8, #48] // A1
+ vmla.f32 q14, q3, d1[0]
+ add r7, r7, #64
+ vmla.f32 q15, q3, d1[1]
+ vldr d1, [r8, #56] // A1
+
+ // unroll 3
+ vmla.f32 q4, q1, d4[0]
+ vldr d6, [r9, #48] // A2
+ vmla.f32 q5, q1, d4[1]
+ vldr d7, [r9, #56] // A2
+ vmla.f32 q6, q1, d5[0]
+ add r8, r8, #64
+ vmla.f32 q7, q1, d5[1]
+ vldr d2, [r5, #0] // A0
+ vmla.f32 q8, q0, d4[0]
+ vldr d3, [r5, #8] // A0
+ vmla.f32 q9, q0, d4[1]
+ add r9, r9, #64
+ vmla.f32 q10, q0, d5[0]
+ cmp r4, #4
+ vmla.f32 q11, q0, d5[1]
+ vldr d0, [r7, #0] // B
+ vmla.f32 q12, q3, d4[0]
+ vldr d1, [r7, #8] // B
+ vmla.f32 q13, q3, d4[1]
+ vldr d4, [r8, #0] // A1
+ vmla.f32 q14, q3, d5[0]
+ vmla.f32 q15, q3, d5[1]
+ vldr d5, [r8, #8] // A1
+
+
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+
+ // unroll 0
+ vmla.f32 q4, q1, d0[0]
+ vldr d6, [r9, #0] // A2
+ vmla.f32 q5, q1, d0[1]
+ vldr d7, [r9, #8] // A2
+ vmla.f32 q6, q1, d1[0]
+ pld [r7, #64]
+ vmla.f32 q7, q1, d1[1]
+ vldr d2, [r5, #16] // A0
+ vmla.f32 q8, q2, d0[0]
+ vldr d3, [r5, #24] // A0
+ vmla.f32 q9, q2, d0[1]
+ pld [r5, #64]
+ vmla.f32 q10, q2, d1[0]
+ pld [r8, #64]
+ vmla.f32 q11, q2, d1[1]
+ vldr d4, [r7, #16] // B
+ vmla.f32 q12, q3, d0[0]
+ vldr d5, [r7, #24] // B
+ vmla.f32 q13, q3, d0[1]
+ vldr d0, [r8, #16] // A1
+ vmla.f32 q14, q3, d1[0]
+ pld [r9, #64]
+ vmla.f32 q15, q3, d1[1]
+ vldr d1, [r8, #24] // A1
+
+ // unroll 1
+ vmla.f32 q4, q1, d4[0]
+ vldr d6, [r9, #16] // A2
+ vmla.f32 q5, q1, d4[1]
+ vldr d7, [r9, #24] // A2
+ vmla.f32 q6, q1, d5[0]
+ sub r4, r4, #4
+ vmla.f32 q7, q1, d5[1]
+ vldr d2, [r5, #32] // A0
+ vmla.f32 q8, q0, d4[0]
+ vldr d3, [r5, #40] // A0
+ vmla.f32 q9, q0, d4[1]
+ vmla.f32 q10, q0, d5[0]
+ vmla.f32 q11, q0, d5[1]
+ vldr d0, [r7, #32] // B
+ vmla.f32 q12, q3, d4[0]
+ vldr d1, [r7, #40] // B
+ vmla.f32 q13, q3, d4[1]
+ vldr d4, [r8, #32] // A1
+ vmla.f32 q14, q3, d5[0]
+ vmla.f32 q15, q3, d5[1]
+ vldr d5, [r8, #40] // A1
+
+ // unroll 2
+ vmla.f32 q4, q1, d0[0]
+ vldr d6, [r9, #32] // A2
+ vmla.f32 q5, q1, d0[1]
+ vldr d7, [r9, #40] // A2
+ vmla.f32 q6, q1, d1[0]
+ vmla.f32 q7, q1, d1[1]
+ vldr d2, [r5, #48] // A0
+ vmla.f32 q8, q2, d0[0]
+ vldr d3, [r5, #56] // A0
+ vmla.f32 q9, q2, d0[1]
+ vmla.f32 q10, q2, d1[0]
+ add r5, r5, #64
+ vmla.f32 q11, q2, d1[1]
+ vldr d4, [r7, #48] // B
+ vmla.f32 q12, q3, d0[0]
+ vldr d5, [r7, #56] // B
+ vmla.f32 q13, q3, d0[1]
+ vldr d0, [r8, #48] // A1
+ vmla.f32 q14, q3, d1[0]
+ add r7, r7, #64
+ vmla.f32 q15, q3, d1[1]
+ vldr d1, [r8, #56] // A1
+
+ // unroll 3
+ vmla.f32 q4, q1, d4[0]
+ vldr d6, [r9, #48] // A2
+ vmla.f32 q5, q1, d4[1]
+ vldr d7, [r9, #56] // A2
+ vmla.f32 q6, q1, d5[0]
+ add r9, r9, #64
+ vmla.f32 q7, q1, d5[1]
+// vldr d2, [r5, #0] // A0
+ vmla.f32 q8, q0, d4[0]
+// vldr d3, [r5, #8] // A0
+ vmla.f32 q9, q0, d4[1]
+ vmla.f32 q10, q0, d5[0]
+ add r8, r8, #64
+ vmla.f32 q11, q0, d5[1]
+// vldr d0, [r7, #0] // B
+ vmla.f32 q12, q3, d4[0]
+// vldr d1, [r7, #8] // B
+ vmla.f32 q13, q3, d4[1]
+// vldr d4, [r8, #0] // A1
+ vmla.f32 q14, q3, d5[0]
+ vmla.f32 q15, q3, d5[1]
+// vldr d5, [r8, #8] // A1
+
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+
+3: // clean1-up loop
+
+ // unroll 0
+ vld1.64 {d4, d5}, [r7:128]! // B
+ vld1.64 {d0, d1}, [r5:128]! // A0
+ vmla.f32 q4, q0, d4[0]
+ sub r4, r4, #1
+ vmla.f32 q5, q0, d4[1]
+ vmla.f32 q6, q0, d5[0]
+ vmla.f32 q7, q0, d5[1]
+ vld1.64 {d0, d1}, [r8:128]! // A1
+ vmla.f32 q8, q0, d4[0]
+ vmla.f32 q9, q0, d4[1]
+ vmla.f32 q10, q0, d5[0]
+ vmla.f32 q11, q0, d5[1]
+ vld1.64 {d0, d1}, [r8:128]! // A1
+ vmla.f32 q12, q0, d4[0]
+ vmla.f32 q13, q0, d4[1]
+ vmla.f32 q14, q0, d5[0]
+ vmla.f32 q15, q0, d5[1]
+
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_12x4_lib4, .-inner_kernel_gemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- alpha
+// r5 <- beta
+// r6 <- C
+// r7 <- 4*sdc*sizeof(float)
+//
+// output arguments:
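+//
+// scales the q4-q15 accumulator by alpha, then, only if beta != 0.0, loads C
+// (three 4-row panels, r7 bytes apart) and adds beta*C; the compare against
+// the .LC00 zero constant lets the beta==0 case skip reading C entirely.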
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_12X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_12x4_lib4, %function
+inner_scale_ab_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_12x4_lib4:
+#endif
+#endif
+
+ flds s8, [r4, #0] // alpha
+
+ vmul.f32 q4, q4, d4[0]
+ flds s9, [r5, #0] // beta
+ vmul.f32 q5, q5, d4[0]
+ flds s10, .LC00 // 0.0
+ vmul.f32 q6, q6, d4[0]
+ vmul.f32 q7, q7, d4[0]
+ fcmpes s9, s10
+ vmul.f32 q8, q8, d4[0]
+ vmul.f32 q9, q9, d4[0]
+ vmul.f32 q10, q10, d4[0]
+ vmul.f32 q11, q11, d4[0]
+ vmul.f32 q12, q12, d4[0]
+ vmul.f32 q13, q13, d4[0]
+ vmul.f32 q14, q14, d4[0]
+ vmul.f32 q15, q15, d4[0]
+ fmstat
+
+ beq 0f // end
+
+ add r8, r6, r7
+ add r9, r8, r7
+
+ vld1.64 {d0, d1, d2, d3}, [r6:128]!
+ vmla.f32 q4, q0, d4[1]
+ vmla.f32 q5, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r6:128]!
+ vmla.f32 q6, q0, d4[1]
+ vmla.f32 q7, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r8:128]!
+ vmla.f32 q8, q0, d4[1]
+ vmla.f32 q9, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r8:128]!
+ vmla.f32 q10, q0, d4[1]
+ vmla.f32 q11, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r9:128]!
+ vmla.f32 q12, q0, d4[1]
+ vmla.f32 q13, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r9:128]!
+ vmla.f32 q14, q0, d4[1]
+ vmla.f32 q15, q1, d4[1]
+
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_12x4_lib4, .-inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- D
+// r5 <- 4*sdd*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_12X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_12x4_lib4, %function
+inner_store_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_12x4_lib4:
+#endif
+#endif
+
+ add r6, r4, r5
+ add r7, r6, r5
+
+ vst1.64 {d8, d9, d10, d11}, [r4:128]!
+ vst1.64 {d12, d13, d14, d15}, [r4:128]!
+ vst1.64 {d16, d17, d18, d19}, [r6:128]!
+ vst1.64 {d20, d21, d22, d23}, [r6:128]!
+ vst1.64 {d24, d25, d26, d27}, [r7:128]!
+ vst1.64 {d28, d29, d30, d31}, [r7:128]!
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_store_12x4_lib4, .-inner_store_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// zero double word
+ .align 3
+.LC00: // { 0 }
+ .word 0
+ .word 0
+ .word 0
+ .word 0
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8 sp+12 sp+16 sp+20
+// void kernel_sgemm_nt_12x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
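+//
+// computes D = alpha*A*B^T + beta*C for a 12x4 block, with A, C and D stored
+// as 4-row panels (lib4 format) and sda/sdc/sdd the respective panel strides.
+// Illustrative call from C (assumes the operands are already packed in panel
+// format; operand names are placeholders):
+//   float alpha = 1.0f, beta = 0.0f;
+//   kernel_sgemm_nt_12x4_lib4(kmax, &alpha, Ap, sda, Bp, &beta, Cp, sdc, Dp, sdd);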
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .global kernel_sgemm_nt_12x4_lib4
+ .type kernel_sgemm_nt_12x4_lib4, %function
+kernel_sgemm_nt_12x4_lib4:
+#elif defined(OS_MAC)
+	.global	_kernel_sgemm_nt_12x4_lib4
+_kernel_sgemm_nt_12x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ vldr d8, .LC00
+ vldr d9, .LC00+8
+ vmov q5, q4
+ vmov q6, q4
+ vmov q7, q4
+ vmov q8, q4
+ vmov q9, q4
+ vmov q10, q4
+ vmov q11, q4
+ vmov q12, q4
+ vmov q13, q4
+ vmov q14, q4
+ vmov q15, q4
+
+
+
+	// call inner kernel gemm nt
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ mov r6, r3 // sda
+ lsl r6, r6, #4 // 4*sizeof(float)*sda
+ ldr r7, [fp, #0] // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_gemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_gemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #4] // beta
+ ldr r6, [fp, #8] // C
+ ldr r7, [fp, #12] // sdc
+ lsl r7, r7, #4 // 4*sizeof(float)*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #16] // D
+ ldr r5, [fp, #20] // sdd
+ lsl r5, r5, #4 // 4*sizeof(float)*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_12x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_12x4_lib4, .-kernel_sgemm_nt_12x4_lib4
+#endif
+
+
+
+
diff --git a/kernel/armv7a/kernel_sgemm_4x4_lib4.S b/kernel/armv7a/kernel_sgemm_4x4_lib4.S
new file mode 100644
index 0000000..e8a2e71
--- /dev/null
+++ b/kernel/armv7a/kernel_sgemm_4x4_lib4.S
@@ -0,0 +1,675 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_4x4_lib4, %function
+inner_kernel_gemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+
+ // preload A
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vld1.64 {d4, d5}, [r6:128]! // B
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // prefetch
+ pld [r5, #64]
+ pld [r6, #64]
+
+ // unroll 0
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d4[1]
+ vld1.64 {d6, d7}, [r6:128]! // B
+ vmla.f32 q6, q0, d5[0]
+ vmla.f32 q7, q0, d5[1]
+
+ // unroll 1
+ vmla.f32 q4, q1, d6[0]
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q5, q1, d6[1]
+ vld1.64 {d4, d5}, [r6:128]! // B
+ vmla.f32 q6, q1, d7[0]
+ vmla.f32 q7, q1, d7[1]
+
+ // unroll 2
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d4[1]
+ vld1.64 {d6, d7}, [r6:128]! // B
+ vmla.f32 q6, q0, d5[0]
+ vmla.f32 q7, q0, d5[1]
+
+ // unroll 3
+ vmla.f32 q4, q1, d6[0]
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q5, q1, d6[1]
+ vld1.64 {d4, d5}, [r6:128]! // B
+ vmla.f32 q6, q1, d7[0]
+ vmla.f32 q7, q1, d7[1]
+
+ sub r4, r4, #4
+
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d4[1]
+ vld1.64 {d6, d7}, [r6:128]! // B
+ vmla.f32 q6, q0, d5[0]
+ vmla.f32 q7, q0, d5[1]
+
+ // unroll 1
+ vmla.f32 q4, q1, d6[0]
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q5, q1, d6[1]
+ vld1.64 {d4, d5}, [r6:128]! // B
+ vmla.f32 q6, q1, d7[0]
+ vmla.f32 q7, q1, d7[1]
+
+ // unroll 2
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d4[1]
+ vld1.64 {d6, d7}, [r6:128]! // B
+ vmla.f32 q6, q0, d5[0]
+ vmla.f32 q7, q0, d5[1]
+
+ // unroll 3
+ vmla.f32 q4, q1, d6[0]
+// vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q5, q1, d6[1]
+// vld1.64 {d4, d5}, [r6:128]! // B
+ vmla.f32 q6, q1, d7[0]
+ vmla.f32 q7, q1, d7[1]
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+ sub r5, r5, #16
+ sub r6, r6, #16
+
+3: // clean1-up loop
+
+ // unroll 0
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vld1.64 {d4, d5}, [r6:128]! // B
+ vmla.f32 q4, q0, d4[0]
+ vmla.f32 q5, q0, d4[1]
+ vmla.f32 q6, q0, d5[0]
+ vmla.f32 q7, q0, d5[1]
+
+ sub r4, r4, #1
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_4x4_lib4, .-inner_kernel_gemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+// r7 <- 4*sdb*sizeof(float)
+//
+// output arguments:
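+//
+// nn variant: B entries are loaded individually with vldr at fixed offsets
+// within the current 4-row panel, and the B pointer r6 is advanced by r7 bytes
+// (one full panel) after every four k-iterations.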
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_4x4_lib4, %function
+inner_kernel_gemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+
+ // preload A
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vldr d4, [r6, #0] // B[0,1]
+ vldr d5, [r6, #16] // B[4,5]
+ vldr d6, [r6, #32] // B[8,9]
+ vldr d7, [r6, #48] // B[12,13]
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // prefetch
+ pld [r5, #64]
+ pld [r6, r7]
+
+ // unroll 0
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 1
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+ vldr d4, [r6, #8] // B[2,3]
+ vmla.f32 q5, q1, d5[1]
+ vldr d5, [r6, #24] // B[6,7]
+ vmla.f32 q6, q1, d6[1]
+ vldr d6, [r6, #40] // B[10,11]
+ vmla.f32 q7, q1, d7[1]
+ vldr d7, [r6, #56] // B[14,15]
+
+ // unroll 2
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ add r6, r6, r7
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 3
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+ vldr d4, [r6, #0] // B[0,1]
+ vmla.f32 q5, q1, d5[1]
+ vldr d5, [r6, #16] // B[4,5]
+ vmla.f32 q6, q1, d6[1]
+ vldr d6, [r6, #32] // B[8,9]
+ vmla.f32 q7, q1, d7[1]
+ vldr d7, [r6, #48] // B[12,13]
+
+ sub r4, r4, #4
+
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 1
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+ vldr d4, [r6, #8] // B[2,3]
+ vmla.f32 q5, q1, d5[1]
+ vldr d5, [r6, #24] // B[6,7]
+ vmla.f32 q6, q1, d6[1]
+ vldr d6, [r6, #40] // B[10,11]
+ vmla.f32 q7, q1, d7[1]
+ vldr d7, [r6, #56] // B[14,15]
+
+ // unroll 2
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ add r6, r6, r7
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 3
+// vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+// vldr d4, [r6, #0] // B[0,1]
+ vmla.f32 q5, q1, d5[1]
+// vldr d5, [r6, #16] // B[4,5]
+ vmla.f32 q6, q1, d6[1]
+// vldr d6, [r6, #32] // B[8,9]
+ vmla.f32 q7, q1, d7[1]
+// vldr d7, [r6, #48] // B[12,13]
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+ sub r5, r5, #16
+
+3: // clean1-up loop
+
+ // unroll 0
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vldr s8, [r6, #0] // B[0]
+ vmla.f32 q4, q0, d4[0]
+ vldr s8, [r6, #16] // B[4]
+ vmla.f32 q5, q0, d4[0]
+ vldr s8, [r6, #32] // B[8]
+ vmla.f32 q6, q0, d4[0]
+ vldr s8, [r6, #48] // B[12]
+ vmla.f32 q7, q0, d4[0]
+
+ sub r4, r4, #1
+ add r6, r6, #4
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_4x4_lib4, .-inner_kernel_gemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- alpha
+// r5 <- beta
+// r6 <- C
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_lib4, %function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ flds s8, [r4, #0] // alpha
+
+ vmul.f32 q4, q4, d4[0]
+ flds s9, [r5, #0] // beta
+ vmul.f32 q5, q5, d4[0]
+ flds s10, .LC00 // 0.0
+ vmul.f32 q6, q6, d4[0]
+ fcmpes s9, s10
+ vmul.f32 q7, q7, d4[0]
+ fmstat
+
+ beq 0f // end
+
+ vld1.64 {d0, d1, d2, d3}, [r6:128]!
+ vmla.f32 q4, q0, d4[1]
+ vmla.f32 q5, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r6:128]!
+ vmla.f32 q6, q0, d4[1]
+ vmla.f32 q7, q1, d4[1]
+
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- D
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, %function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#endif
+#endif
+
+ vst1.64 {d8, d9, d10, d11}, [r4:128]!
+ vst1.64 {d12, d13, d14, d15}, [r4:128]!
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// zero double word
+ .align 3
+.LC00: // { 0 }
+ .word 0
+ .word 0
+ .word 0
+ .word 0
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8
+// void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
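+//
+// computes D = alpha*A*B^T + beta*C for a single 4x4 block in lib4 panel
+// format. Illustrative call from C (operand names are placeholders):
+//   float alpha = 1.0f, beta = 1.0f;
+//   kernel_sgemm_nt_4x4_lib4(kmax, &alpha, Ap, Bp, &beta, Cp, Dp);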
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .global kernel_sgemm_nt_4x4_lib4
+ .type kernel_sgemm_nt_4x4_lib4, %function
+kernel_sgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+	.global	_kernel_sgemm_nt_4x4_lib4
+_kernel_sgemm_nt_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ vldr d8, .LC00
+ vldr d9, .LC00+8
+ vmov q5, q4
+ vmov q6, q4
+ vmov q7, q4
+
+
+
+	// call inner kernel gemm nt
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ mov r6, r3 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_gemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_gemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #0] // beta
+ ldr r6, [fp, #4] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #8] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x4_lib4, .-kernel_sgemm_nt_4x4_lib4
+#endif
+
+
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8 sp+12
+// void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D)
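+//
+// same as the nt kernel above, but with B accessed in non-transposed layout;
+// sdb is the panel stride of B. Illustrative call (placeholder names):
+//   kernel_sgemm_nn_4x4_lib4(kmax, &alpha, Ap, Bp, sdb, &beta, Cp, Dp);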
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .global kernel_sgemm_nn_4x4_lib4
+ .type kernel_sgemm_nn_4x4_lib4, %function
+kernel_sgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+	.global	_kernel_sgemm_nn_4x4_lib4
+_kernel_sgemm_nn_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ vldr d8, .LC00
+ vldr d9, .LC00+8
+ vmov q5, q4
+ vmov q6, q4
+ vmov q7, q4
+
+
+
+	// call inner kernel gemm nn
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ mov r6, r3 // B
+ ldr r7, [fp, #0] // sdb
+ lsl r7, r7, #4 // 4*sizeof(float)*sdb
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_gemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_gemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #4] // beta
+ ldr r6, [fp, #8] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #12] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_4x4_lib4, .-kernel_sgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+
diff --git a/kernel/armv7a/kernel_sgemm_8x4_lib4.S b/kernel/armv7a/kernel_sgemm_8x4_lib4.S
new file mode 100644
index 0000000..f356c9b
--- /dev/null
+++ b/kernel/armv7a/kernel_sgemm_8x4_lib4.S
@@ -0,0 +1,795 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- 4*sda*sizeof(float)
+// r7 <- B
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_8x4_lib4, %function
+inner_kernel_gemm_add_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ add r8, r5, r6 // A1
+
+ // prefetch
+ pld [r5, #0]
+ pld [r7, #0]
+ pld [r8, #0]
+ pld [r7, #64]
+
+ // preload
+	vld1.64		{d0, d1}, [r7:128]! // B // TODO: preload B in d0-d3 too?
+	vld1.64		{d2, d3}, [r7:128]! // B
+	vld1.64		{d4, d5}, [r7:128]! // B
+	vld1.64		{d6, d7}, [r7:128]! // B
+ vld1.64 {d24, d25}, [r5:128]! // A0
+ vld1.64 {d28, d29}, [r5:128]! // A0
+ vld1.64 {d26, d27}, [r8:128] // A1
+
+ sub r7, r7, #64
+ sub r5, r5, #32
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // unroll 0
+ pld [r5, #64] // A0
+ vmla.f32 q4, q12, d0[0]
+ vldr d30, [r8, #16] // A1
+ vmla.f32 q5, q12, d0[1]
+ vldr d31, [r8, #24] // A1
+ vmla.f32 q6, q12, d1[0]
+ pld [r7, #128] // B
+ vmla.f32 q7, q12, d1[1]
+ vldr d24, [r5, #32]
+ vmla.f32 q8, q13, d0[0]
+ vldr d25, [r5, #40]
+ vmla.f32 q9, q13, d0[1]
+ vldr d0, [r7, #64]
+ vmla.f32 q10, q13, d1[0]
+ pld [r8, #64] // A1
+ vmla.f32 q11, q13, d1[1]
+ vldr d1, [r7, #72]
+
+ // unroll 1
+ vmla.f32 q4, q14, d2[0]
+ vldr d26, [r8, #32] // A1
+ vmla.f32 q5, q14, d2[1]
+ vldr d27, [r8, #40] // A1
+ vmla.f32 q6, q14, d3[0]
+ vmla.f32 q7, q14, d3[1]
+ vldr d28, [r5, #48]
+ vmla.f32 q8, q15, d2[0]
+ vldr d29, [r5, #56]
+ vmla.f32 q9, q15, d2[1]
+ vldr d2, [r7, #80]
+ vmla.f32 q10, q15, d3[0]
+ add r5, r5, #64
+ vmla.f32 q11, q15, d3[1]
+ vldr d3, [r7, #88]
+
+ // unroll 2
+ vmla.f32 q4, q12, d4[0]
+ vldr d30, [r8, #48] // A1
+ vmla.f32 q5, q12, d4[1]
+ vldr d31, [r8, #56] // A1
+ vmla.f32 q6, q12, d5[0]
+ add r7, r7, #64
+ vmla.f32 q7, q12, d5[1]
+ vldr d24, [r5, #0]
+ vmla.f32 q8, q13, d4[0]
+ vldr d25, [r5, #8]
+ vmla.f32 q9, q13, d4[1]
+ vldr d4, [r7, #32]
+ vmla.f32 q10, q13, d5[0]
+ add r8, r8, #64
+ vmla.f32 q11, q13, d5[1]
+ vldr d5, [r7, #40]
+
+ // unroll 3
+ vmla.f32 q4, q14, d6[0]
+ vldr d26, [r8, #0] // A1
+ vmla.f32 q5, q14, d6[1]
+ vldr d27, [r8, #8] // A1
+ vmla.f32 q6, q14, d7[0]
+ sub r4, r4, #4
+ vmla.f32 q7, q14, d7[1]
+ vldr d28, [r5, #16]
+ vmla.f32 q8, q15, d6[0]
+ vldr d29, [r5, #24]
+ vmla.f32 q9, q15, d6[1]
+ vldr d6, [r7, #48]
+ vmla.f32 q10, q15, d7[0]
+ vmla.f32 q11, q15, d7[1]
+ vldr d7, [r7, #56]
+
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+
+ // unroll 0
+ vmla.f32 q4, q12, d0[0]
+ vldr d30, [r8, #16] // A1
+ vmla.f32 q5, q12, d0[1]
+ vldr d31, [r8, #24] // A1
+ vmla.f32 q6, q12, d1[0]
+ vmla.f32 q7, q12, d1[1]
+ vldr d24, [r5, #32]
+ vmla.f32 q8, q13, d0[0]
+ vldr d25, [r5, #40]
+ vmla.f32 q9, q13, d0[1]
+// vldr d4, [r7, #64]
+ vmla.f32 q10, q13, d1[0]
+ vmla.f32 q11, q13, d1[1]
+// vldr d5, [r7, #72]
+
+ // unroll 1
+ vmla.f32 q4, q14, d2[0]
+ vldr d26, [r8, #32] // A1
+ vmla.f32 q5, q14, d2[1]
+ vldr d27, [r8, #40] // A1
+ vmla.f32 q6, q14, d3[0]
+ vmla.f32 q7, q14, d3[1]
+ vldr d28, [r5, #48]
+ vmla.f32 q8, q15, d2[0]
+ vldr d29, [r5, #56]
+ vmla.f32 q9, q15, d2[1]
+// vldr d6, [r7, #80]
+ vmla.f32 q10, q15, d3[0]
+// add r5, r5, #64
+ vmla.f32 q11, q15, d3[1]
+// vldr d7, [r7, #88]
+
+ // unroll 2
+ vmla.f32 q4, q12, d4[0]
+ vldr d30, [r8, #48] // A1
+ vmla.f32 q5, q12, d4[1]
+ vldr d31, [r8, #56] // A1
+ vmla.f32 q6, q12, d5[0]
+// add r7, r7, #64
+ vmla.f32 q7, q12, d5[1]
+// vldr d24, [r5, #0]
+ vmla.f32 q8, q13, d4[0]
+// vldr d25, [r5, #8]
+ vmla.f32 q9, q13, d4[1]
+// vldr d4, [r7, #32]
+ vmla.f32 q10, q13, d5[0]
+// add r8, r8, #64
+ vmla.f32 q11, q13, d5[1]
+// vldr d5, [r7, #40]
+
+ // unroll 3
+ vmla.f32 q4, q14, d6[0]
+// vldr d26, [r8, #0] // A1
+ vmla.f32 q5, q14, d6[1]
+// vldr d27, [r8, #8] // A1
+ vmla.f32 q6, q14, d7[0]
+ sub r4, r4, #4
+ vmla.f32 q7, q14, d7[1]
+// vldr d28, [r5, #16]
+ vmla.f32 q8, q15, d6[0]
+// vldr d29, [r5, #24]
+ vmla.f32 q9, q15, d6[1]
+// vldr d6, [r7, #48]
+ vmla.f32 q10, q15, d7[0]
+ vmla.f32 q11, q15, d7[1]
+// vldr d7, [r7, #56]
+
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+// sub r5, r5, #32 // A0
+// sub r7, r7, #32 // B
+// sub r8, r8, #16 // A1
+
+3: // clean1-up loop
+
+ // unroll 0
+ vld1.64 {d4, d5}, [r7:128]! // B
+ vld1.64 {d0, d1}, [r5:128]! // A0
+ vmla.f32 q4, q0, d4[0]
+ vmla.f32 q5, q0, d4[1]
+ vmla.f32 q6, q0, d5[0]
+ vmla.f32 q7, q0, d5[1]
+ vld1.64 {d0, d1}, [r8:128]! // A1
+ vmla.f32 q8, q0, d4[0]
+ vmla.f32 q9, q0, d4[1]
+ vmla.f32 q10, q0, d5[0]
+ vmla.f32 q11, q0, d5[1]
+
+ sub r4, r4, #1
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_8x4_lib4, .-inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+#if 0
+// subroutine
+//
+// input arguments:
+// r4 <- k
+// r5 <- A
+// r6 <- B
+// r7 <- 4*sdb*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_4x4_lib4, %function
+inner_kernel_gemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ // early return
+ cmp r4, #0
+ ble 2f // return
+
+ // prefetch
+ pld [r5, #0]
+ pld [r6, #0]
+
+ // preload A
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vldr d4, [r6, #0] // B[0,1]
+ vldr d5, [r6, #16] // B[4,5]
+ vldr d6, [r6, #32] // B[8,9]
+ vldr d7, [r6, #48] // B[12,13]
+
+ cmp r4, #4
+ ble 0f // consider clean up loop
+
+ // main loop
+1:
+
+ // prefetch
+ pld [r5, #64]
+ pld [r6, r7]
+
+ // unroll 0
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 1
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+ vldr d4, [r6, #8] // B[2,3]
+ vmla.f32 q5, q1, d5[1]
+ vldr d5, [r6, #24] // B[6,7]
+ vmla.f32 q6, q1, d6[1]
+ vldr d6, [r6, #40] // B[10,11]
+ vmla.f32 q7, q1, d7[1]
+ vldr d7, [r6, #56] // B[14,15]
+
+ // unroll 2
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ add r6, r6, r7
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 3
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+ vldr d4, [r6, #0] // B[0,1]
+ vmla.f32 q5, q1, d5[1]
+ vldr d5, [r6, #16] // B[4,5]
+ vmla.f32 q6, q1, d6[1]
+ vldr d6, [r6, #32] // B[8,9]
+ vmla.f32 q7, q1, d7[1]
+ vldr d7, [r6, #48] // B[12,13]
+
+ sub r4, r4, #4
+
+ cmp r4, #4
+ bgt 1b
+
+0:
+
+ cmp r4, #3
+ ble 4f
+
+ // unroll 0
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 1
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+ vldr d4, [r6, #8] // B[2,3]
+ vmla.f32 q5, q1, d5[1]
+ vldr d5, [r6, #24] // B[6,7]
+ vmla.f32 q6, q1, d6[1]
+ vldr d6, [r6, #40] // B[10,11]
+ vmla.f32 q7, q1, d7[1]
+ vldr d7, [r6, #56] // B[14,15]
+
+ // unroll 2
+ vmla.f32 q4, q0, d4[0]
+ vld1.64 {d2, d3}, [r5:128]! // A
+ vmla.f32 q5, q0, d5[0]
+ add r6, r6, r7
+ vmla.f32 q6, q0, d6[0]
+ vmla.f32 q7, q0, d7[0]
+
+ // unroll 3
+// vld1.64 {d0, d1}, [r5:128]! // A
+ vmla.f32 q4, q1, d4[1]
+// vldr d4, [r6, #0] // B[0,1]
+ vmla.f32 q5, q1, d5[1]
+// vldr d5, [r6, #16] // B[4,5]
+ vmla.f32 q6, q1, d6[1]
+// vldr d6, [r6, #32] // B[8,9]
+ vmla.f32 q7, q1, d7[1]
+// vldr d7, [r6, #48] // B[12,13]
+
+ sub r4, r4, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp r4, #0
+ ble 2f // return
+
+ sub r5, r5, #16
+
+3: // clean1-up loop
+
+ // unroll 0
+ vld1.64 {d0, d1}, [r5:128]! // A
+ vldr s8, [r6, #0] // B[0]
+ vmla.f32 q4, q0, d4[0]
+ vldr s8, [r6, #16] // B[4]
+ vmla.f32 q5, q0, d4[0]
+ vldr s8, [r6, #32] // B[8]
+ vmla.f32 q6, q0, d4[0]
+ vldr s8, [r6, #48] // B[12]
+ vmla.f32 q7, q0, d4[0]
+
+ sub r4, r4, #1
+ add r6, r6, #4
+ cmp r4, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_4x4_lib4, .-inner_kernel_gemm_add_nn_4x4_lib4
+#endif
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- alpha
+// r5 <- beta
+// r6 <- C
+// r7 <- 4*sdc*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_8X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_lib4, %function
+inner_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib4:
+#endif
+#endif
+
+ flds s8, [r4, #0] // alpha
+
+ vmul.f32 q4, q4, d4[0]
+ flds s9, [r5, #0] // beta
+ vmul.f32 q5, q5, d4[0]
+ flds s10, .LC00 // 0.0
+ vmul.f32 q6, q6, d4[0]
+ vmul.f32 q7, q7, d4[0]
+ fcmpes s9, s10
+ vmul.f32 q8, q8, d4[0]
+ vmul.f32 q9, q9, d4[0]
+ vmul.f32 q10, q10, d4[0]
+ vmul.f32 q11, q11, d4[0]
+ fmstat
+
+ beq 0f // end
+
+ add r8, r6, r7
+
+ vld1.64 {d0, d1, d2, d3}, [r6:128]!
+ vmla.f32 q4, q0, d4[1]
+ vmla.f32 q5, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r6:128]!
+ vmla.f32 q6, q0, d4[1]
+ vmla.f32 q7, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r8:128]!
+ vmla.f32 q8, q0, d4[1]
+ vmla.f32 q9, q1, d4[1]
+ vld1.64 {d0, d1, d2, d3}, [r8:128]!
+ vmla.f32 q10, q0, d4[1]
+ vmla.f32 q11, q1, d4[1]
+
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r4 <- D
+// r5 <- 4*sdd*sizeof(float)
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_8X4_LIB4
+#else
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_lib4, %function
+inner_store_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib4:
+#endif
+#endif
+
+ add r6, r4, r5
+
+ vst1.64 {d8, d9, d10, d11}, [r4:128]!
+ vst1.64 {d12, d13, d14, d15}, [r4:128]!
+ vst1.64 {d16, d17, d18, d19}, [r6:128]!
+ vst1.64 {d20, d21, d22, d23}, [r6:128]!
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ mov pc, lr // return
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// zero double word
+ .align 3
+.LC00: // { 0 }
+ .word 0
+ .word 0
+ .word 0
+ .word 0
+
+// r0 r1 r2 r3 sp+0 sp+4 sp+8 sp+12 sp+16 sp+20
+// void kernel_sgemm_nt_8x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
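+//
+// computes D = alpha*A*B^T + beta*C for an 8x4 block: A, C and D span two
+// 4-row panels (strides sda, sdc, sdd), while B is a single 4-row panel.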
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .global kernel_sgemm_nt_8x4_lib4
+ .type kernel_sgemm_nt_8x4_lib4, %function
+kernel_sgemm_nt_8x4_lib4:
+#elif defined(OS_MAC)
+	.global	_kernel_sgemm_nt_8x4_lib4
+_kernel_sgemm_nt_8x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ vldr d8, .LC00
+ vldr d9, .LC00+8
+ vmov q5, q4
+ vmov q6, q4
+ vmov q7, q4
+ vmov q8, q4
+ vmov q9, q4
+ vmov q10, q4
+ vmov q11, q4
+
+
+
+	// call inner kernel gemm nt
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ mov r6, r3 // sda
+ lsl r6, r6, #4 // 4*sizeof(float)*sda
+ ldr r7, [fp, #0] // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_gemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #4] // beta
+ ldr r6, [fp, #8] // C
+ ldr r7, [fp, #12] // sdc
+ lsl r7, r7, #4 // 4*sizeof(float)*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #16] // D
+ ldr r5, [fp, #20] // sdd
+ lsl r5, r5, #4 // 4*sizeof(float)*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_8x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_lib4, .-kernel_sgemm_nt_8x4_lib4
+#endif
+
+
+
+#if 0
+// r0 r1 r2 r3 sp+0 sp+4 sp+8 sp+12
+// void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D)
+
+// .p2align 4,,15
+#if defined(OS_LINUX)
+ .global kernel_sgemm_nn_4x4_lib4
+ .type kernel_sgemm_nn_4x4_lib4, %function
+kernel_sgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+	.global	_kernel_sgemm_nn_4x4_lib4
+_kernel_sgemm_nn_4x4_lib4:
+#endif
+
+ // prologue
+
+ // save GP registers
+ stmdb sp!, {r4 - r10, fp, lr} // save registers
+ add fp, sp, #36 // fp to old sp position
+
+ // save FP registers
+ fstmfdd sp!, {d8-d15}
+
+
+
+ // zero accumulation registers
+ vldr d8, .LC00
+ vldr d9, .LC00+8
+ vmov q5, q4
+ vmov q6, q4
+ vmov q7, q4
+
+
+
+	// call inner kernel gemm nn
+ mov r4, r0 // kmax
+ mov r5, r2 // A
+ mov r6, r3 // B
+ ldr r7, [fp, #0] // sdb
+ lsl r7, r7, #4 // 4*sizeof(float)*sdb
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_kernel_gemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_kernel_gemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov r4, r1 // alpha
+ ldr r5, [fp, #4] // beta
+ ldr r6, [fp, #8] // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+ // store n
+ ldr r4, [fp, #12] // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX)
+ bl inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ bl _inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+ // epilogue
+
+ // load FP registers
+ fldmfdd sp!, {d8-d15}
+
+ // load GP registers and return
+// ldmia sp!, {r4 - r10, fp, lr} // load registers
+// mov pc, lr // return
+ ldmia sp!, {r4 - r10, fp, pc} // load registers and return
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_4x4_lib4, .-kernel_sgemm_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+
diff --git a/kernel/armv8a/Makefile b/kernel/armv8a/Makefile
new file mode 100644
index 0000000..75e1faf
--- /dev/null
+++ b/kernel/armv8a/Makefile
@@ -0,0 +1,49 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += kernel_dgemm_8x4_lib4.o kernel_dgemm_4x4_lib4.o
+OBJS += kernel_sgemm_16x4_lib4.o kernel_sgemm_12x4_lib4.o kernel_sgemm_8x8_lib4.o kernel_sgemm_8x4_lib4.o kernel_sgemm_4x4_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
diff --git a/kernel/armv8a/kernel_dgemm_4x4_lib4.S b/kernel/armv8a/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..2d43b10
--- /dev/null
+++ b/kernel/armv8a/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,414 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ add sp, sp, #-(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
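+
+// PROLOGUE and EPILOGUE save and restore d8-d15 and x18-x30 in an 11*16-byte
+// stack frame, so the kernels below are free to use the remaining vector and
+// general-purpose registers without further saving.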
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- B
+//
+// output arguments:
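+//
+// the 4x4 double-precision accumulator is kept in v0-v7: each column of the
+// result uses two 2-double registers (v0/v1 column 0, ..., v6/v7 column 3),
+// updated with fmla against broadcast elements of B.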
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, %function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+
+// TODO: more aggressive preload of A
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ // prefetch
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x10, #0]
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // preload
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+
+ // prefetch
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x10, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ fmla v0.2d, v24.2d, v28.2d[0]
+ ld1 {v26.2d, v27.2d}, [x9], #32
+ fmla v1.2d, v25.2d, v28.2d[0]
+ ld1 {v30.2d, v31.2d}, [x10], #32
+ fmla v2.2d, v24.2d, v28.2d[1]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ prfm PLDL1KEEP, [x10, #64]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+ sub w8, w8, #4
+
+ // unroll 1
+ fmla v0.2d, v26.2d, v30.2d[0]
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ fmla v1.2d, v27.2d, v30.2d[0]
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v2.2d, v26.2d, v30.2d[1]
+ fmla v3.2d, v27.2d, v30.2d[1]
+ fmla v4.2d, v26.2d, v31.2d[0]
+ fmla v5.2d, v27.2d, v31.2d[0]
+ fmla v6.2d, v26.2d, v31.2d[1]
+ fmla v7.2d, v27.2d, v31.2d[1]
+
+ // unroll 2
+ fmla v0.2d, v24.2d, v28.2d[0]
+ ld1 {v26.2d, v27.2d}, [x9], #32
+ fmla v1.2d, v25.2d, v28.2d[0]
+ ld1 {v30.2d, v31.2d}, [x10], #32
+ fmla v2.2d, v24.2d, v28.2d[1]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ prfm PLDL1KEEP, [x10, #64]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ // unroll 3
+ fmla v0.2d, v26.2d, v30.2d[0]
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ fmla v1.2d, v27.2d, v30.2d[0]
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v2.2d, v26.2d, v30.2d[1]
+ fmla v3.2d, v27.2d, v30.2d[1]
+ fmla v4.2d, v26.2d, v31.2d[0]
+ fmla v5.2d, v27.2d, v31.2d[0]
+ fmla v6.2d, v26.2d, v31.2d[1]
+ fmla v7.2d, v27.2d, v31.2d[1]
+
+ cmp w8, #4
+ bgt 1b
+
+ sub x9, x9, #32
+ sub x10, x10, #32
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ // unroll 1
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ // unroll 2
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ // unroll 3
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ sub w8, w8, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ // unroll 0
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v24.2d, v28.2d[1]
+ fmla v3.2d, v25.2d, v28.2d[1]
+ fmla v4.2d, v24.2d, v29.2d[0]
+ fmla v5.2d, v25.2d, v29.2d[0]
+ fmla v6.2d, v24.2d, v29.2d[1]
+ fmla v7.2d, v25.2d, v29.2d[1]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_4x4_lib4, %function
+inner_scale_ab_4x4_lib4:
+#endif
+
+ ld1 {v28.2d}, [x8]
+
+ fmul v0.2d, v0.2d, v28.2d[0]
+ fmul v1.2d, v1.2d, v28.2d[0]
+ fmul v2.2d, v2.2d, v28.2d[0]
+ fmul v3.2d, v3.2d, v28.2d[0]
+ fmul v4.2d, v4.2d, v28.2d[0]
+ fmul v5.2d, v5.2d, v28.2d[0]
+ fmul v6.2d, v6.2d, v28.2d[0]
+ fmul v7.2d, v7.2d, v28.2d[0]
+
+ ld1 {v28.2d}, [x9]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v26.2d, v28.2d[0]
+ fmla v3.2d, v27.2d, v28.2d[0]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+ fmla v4.2d, v24.2d, v28.2d[0]
+ fmla v5.2d, v25.2d, v28.2d[0]
+ fmla v6.2d, v26.2d, v28.2d[0]
+ fmla v7.2d, v27.2d, v28.2d[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+//
+// output arguments:
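+//
+// operation:
+// D[0:4,0:4] <- [v0..v7] (D stored as one contiguous 4x4 lib4 panel)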
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .align 4
+ .type inner_store_4x4_lib4, %function
+inner_store_4x4_lib4:
+#endif
+
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x8], #64
+ st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x8], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 x3 x4 x5 x6
+// void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
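+//
+// For lib4 panel-major storage (4-row panels, column-major within each panel), the
+// kernel performs the operation sketched by the illustrative C reference below;
+// ref_dgemm_nt_4x4_lib4 is a hypothetical name used only for this comment:
+//
+// void ref_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+// 	{
+// 	int ii, jj, kk;
+// 	for(jj=0; jj<4; jj++)
+// 		for(ii=0; ii<4; ii++)
+// 			{
+// 			double tmp = 0.0;
+// 			for(kk=0; kk<kmax; kk++)
+// 				tmp += A[ii+4*kk] * B[jj+4*kk];
+// 			D[ii+4*jj] = alpha[0]*tmp + beta[0]*C[ii+4*jj];
+// 			}
+// 	}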
+
+ .align 4
+ .global kernel_dgemm_nt_4x4_lib4
+ .type kernel_dgemm_nt_4x4_lib4, %function
+kernel_dgemm_nt_4x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+
+
+
+ // call inner kernel dgemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov x10, x3 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ bl inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x4 // beta
+ mov x10, x5 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+ bl inner_scale_ab_4x4_lib4
+#endif
+
+
+
+ // store n
+ mov x8, x6
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+ bl inner_store_4x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
diff --git a/kernel/armv8a/kernel_dgemm_8x4_lib4.S b/kernel/armv8a/kernel_dgemm_8x4_lib4.S
new file mode 100644
index 0000000..314489d
--- /dev/null
+++ b/kernel/armv8a/kernel_dgemm_8x4_lib4.S
@@ -0,0 +1,575 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- sda
+// x11 <- B
+//
+// output arguments:
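+//
+// operation (accumulators):
+// v0-v7 <- rows 0-3 of A*B^T (A panel at x9), v8-v15 <- rows 4-7 (second A panel at x9+x10)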
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_8x4_lib4, %function
+inner_kernel_gemm_add_nt_8x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x12, x9, x10
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x12, #0]
+
+ // preload
+ ldp d24, d25, [x11], #16
+ ldp d26, d27, [x11], #16
+ ldp q16, q17, [x9], #32
+ ldp q20, q21, [x12], #32
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x12, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ ldp d28, d29, [x11], #16
+ fmla v0.2d, v16.2d, v24.2d[0]
+ fmla v1.2d, v17.2d, v24.2d[0]
+ ldp d30, d31, [x11], #16
+ fmla v2.2d, v16.2d, v25.2d[0]
+ fmla v3.2d, v17.2d, v25.2d[0]
+ ldr q18, [x9], #16
+ fmla v4.2d, v16.2d, v26.2d[0]
+ fmla v5.2d, v17.2d, v26.2d[0]
+ ldr q19, [x9], #16
+ fmla v6.2d, v16.2d, v27.2d[0]
+ fmla v7.2d, v17.2d, v27.2d[0]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v8.2d, v20.2d, v24.2d[0]
+ fmla v9.2d, v21.2d, v24.2d[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v10.2d, v20.2d, v25.2d[0]
+ fmla v11.2d, v21.2d, v25.2d[0]
+ ldp q22, q23, [x12], #32
+ fmla v12.2d, v20.2d, v26.2d[0]
+ fmla v13.2d, v21.2d, v26.2d[0]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v14.2d, v20.2d, v27.2d[0]
+ fmla v15.2d, v21.2d, v27.2d[0]
+
+ // unroll 1
+ ldp d24, d25, [x11], #16
+ fmla v0.2d, v18.2d, v28.2d[0]
+ fmla v1.2d, v19.2d, v28.2d[0]
+ ldp d26, d27, [x11], #16
+ fmla v2.2d, v18.2d, v29.2d[0]
+ fmla v3.2d, v19.2d, v29.2d[0]
+ ldr q16, [x9], #16
+ fmla v4.2d, v18.2d, v30.2d[0]
+ fmla v5.2d, v19.2d, v30.2d[0]
+ ldr q17, [x9], #16
+ fmla v6.2d, v18.2d, v31.2d[0]
+ fmla v7.2d, v19.2d, v31.2d[0]
+ ldr q20, [x12], #16
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+ ldr q21, [x12], #16
+ fmla v10.2d, v22.2d, v29.2d[0]
+ fmla v11.2d, v23.2d, v29.2d[0]
+ sub w8, w8, #4
+ fmla v12.2d, v22.2d, v30.2d[0]
+ fmla v13.2d, v23.2d, v30.2d[0]
+ fmla v14.2d, v22.2d, v31.2d[0]
+ fmla v15.2d, v23.2d, v31.2d[0]
+
+ // unroll 2
+ ldp d28, d29, [x11], #16
+ fmla v0.2d, v16.2d, v24.2d[0]
+ fmla v1.2d, v17.2d, v24.2d[0]
+ ldp d30, d31, [x11], #16
+ fmla v2.2d, v16.2d, v25.2d[0]
+ fmla v3.2d, v17.2d, v25.2d[0]
+ ldr q18, [x9], #16
+ fmla v4.2d, v16.2d, v26.2d[0]
+ fmla v5.2d, v17.2d, v26.2d[0]
+ ldr q19, [x9], #16
+ fmla v6.2d, v16.2d, v27.2d[0]
+ fmla v7.2d, v17.2d, v27.2d[0]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v8.2d, v20.2d, v24.2d[0]
+ fmla v9.2d, v21.2d, v24.2d[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v10.2d, v20.2d, v25.2d[0]
+ fmla v11.2d, v21.2d, v25.2d[0]
+ ldp q22, q23, [x12], #32
+ fmla v12.2d, v20.2d, v26.2d[0]
+ fmla v13.2d, v21.2d, v26.2d[0]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v14.2d, v20.2d, v27.2d[0]
+ fmla v15.2d, v21.2d, v27.2d[0]
+
+ // unroll 3
+ ldp d24, d25, [x11], #16
+ fmla v0.2d, v18.2d, v28.2d[0]
+ fmla v1.2d, v19.2d, v28.2d[0]
+ ldp d26, d27, [x11], #16
+ fmla v2.2d, v18.2d, v29.2d[0]
+ fmla v3.2d, v19.2d, v29.2d[0]
+ ldr q16, [x9], #16
+ fmla v4.2d, v18.2d, v30.2d[0]
+ fmla v5.2d, v19.2d, v30.2d[0]
+ ldr q17, [x9], #16
+ fmla v6.2d, v18.2d, v31.2d[0]
+ fmla v7.2d, v19.2d, v31.2d[0]
+ ldr q20, [x12], #16
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+ ldr q21, [x12], #16
+ fmla v10.2d, v22.2d, v29.2d[0]
+ fmla v11.2d, v23.2d, v29.2d[0]
+ cmp w8, #4
+ fmla v12.2d, v22.2d, v30.2d[0]
+ fmla v13.2d, v23.2d, v30.2d[0]
+ fmla v14.2d, v22.2d, v31.2d[0]
+ fmla v15.2d, v23.2d, v31.2d[0]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+
+ // unroll 0
+ ldp d28, d29, [x11], #16
+ fmla v0.2d, v16.2d, v24.2d[0]
+ fmla v1.2d, v17.2d, v24.2d[0]
+ ldp d30, d31, [x11], #16
+ fmla v2.2d, v16.2d, v25.2d[0]
+ fmla v3.2d, v17.2d, v25.2d[0]
+ ldr q18, [x9], #16
+ fmla v4.2d, v16.2d, v26.2d[0]
+ fmla v5.2d, v17.2d, v26.2d[0]
+ ldr q19, [x9], #16
+ fmla v6.2d, v16.2d, v27.2d[0]
+ fmla v7.2d, v17.2d, v27.2d[0]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v8.2d, v20.2d, v24.2d[0]
+ fmla v9.2d, v21.2d, v24.2d[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v10.2d, v20.2d, v25.2d[0]
+ fmla v11.2d, v21.2d, v25.2d[0]
+ ldp q22, q23, [x12], #32
+ fmla v12.2d, v20.2d, v26.2d[0]
+ fmla v13.2d, v21.2d, v26.2d[0]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v14.2d, v20.2d, v27.2d[0]
+ fmla v15.2d, v21.2d, v27.2d[0]
+
+ // unroll 1
+ ldp d24, d25, [x11], #16
+ fmla v0.2d, v18.2d, v28.2d[0]
+ fmla v1.2d, v19.2d, v28.2d[0]
+ ldp d26, d27, [x11], #16
+ fmla v2.2d, v18.2d, v29.2d[0]
+ fmla v3.2d, v19.2d, v29.2d[0]
+ ldr q16, [x9], #16
+ fmla v4.2d, v18.2d, v30.2d[0]
+ fmla v5.2d, v19.2d, v30.2d[0]
+ ldr q17, [x9], #16
+ fmla v6.2d, v18.2d, v31.2d[0]
+ fmla v7.2d, v19.2d, v31.2d[0]
+ ldr q20, [x12], #16
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+ ldr q21, [x12], #16
+ fmla v10.2d, v22.2d, v29.2d[0]
+ fmla v11.2d, v23.2d, v29.2d[0]
+ sub w8, w8, #4
+ fmla v12.2d, v22.2d, v30.2d[0]
+ fmla v13.2d, v23.2d, v30.2d[0]
+ fmla v14.2d, v22.2d, v31.2d[0]
+ fmla v15.2d, v23.2d, v31.2d[0]
+
+ // unroll 2
+ ldp d28, d29, [x11], #16
+ fmla v0.2d, v16.2d, v24.2d[0]
+ fmla v1.2d, v17.2d, v24.2d[0]
+ ldp d30, d31, [x11], #16
+ fmla v2.2d, v16.2d, v25.2d[0]
+ fmla v3.2d, v17.2d, v25.2d[0]
+ ldr q18, [x9], #16
+ fmla v4.2d, v16.2d, v26.2d[0]
+ fmla v5.2d, v17.2d, v26.2d[0]
+ ldr q19, [x9], #16
+ fmla v6.2d, v16.2d, v27.2d[0]
+ fmla v7.2d, v17.2d, v27.2d[0]
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v8.2d, v20.2d, v24.2d[0]
+ fmla v9.2d, v21.2d, v24.2d[0]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v10.2d, v20.2d, v25.2d[0]
+ fmla v11.2d, v21.2d, v25.2d[0]
+ ldp q22, q23, [x12], #32
+ fmla v12.2d, v20.2d, v26.2d[0]
+ fmla v13.2d, v21.2d, v26.2d[0]
+// prfm PLDL1KEEP, [x12, #64]
+ fmla v14.2d, v20.2d, v27.2d[0]
+ fmla v15.2d, v21.2d, v27.2d[0]
+
+ // unroll 3
+// ldp d24, d25, [x11], #16
+ fmla v0.2d, v18.2d, v28.2d[0]
+ fmla v1.2d, v19.2d, v28.2d[0]
+// ldp d26, d27, [x11], #16
+ fmla v2.2d, v18.2d, v29.2d[0]
+ fmla v3.2d, v19.2d, v29.2d[0]
+// ldr q16, [x9], #16
+ fmla v4.2d, v18.2d, v30.2d[0]
+ fmla v5.2d, v19.2d, v30.2d[0]
+// ldr q17, [x9], #16
+ fmla v6.2d, v18.2d, v31.2d[0]
+ fmla v7.2d, v19.2d, v31.2d[0]
+// ldr q20, [x12], #16
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+// ldr q21, [x12], #16
+ fmla v10.2d, v22.2d, v29.2d[0]
+ fmla v11.2d, v23.2d, v29.2d[0]
+// cmp w8, #4
+ fmla v12.2d, v22.2d, v30.2d[0]
+ fmla v13.2d, v23.2d, v30.2d[0]
+ fmla v14.2d, v22.2d, v31.2d[0]
+ fmla v15.2d, v23.2d, v31.2d[0]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #32
+ sub x11, x11, #32
+ sub x12, x12, #32
+
+3: // clean1-up loop
+
+ // unroll 0
+ ld1 {v20.2d, v21.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x11], #32
+ fmla v0.2d, v20.2d, v28.2d[0]
+ fmla v1.2d, v21.2d, v28.2d[0]
+ fmla v2.2d, v20.2d, v28.2d[1]
+ fmla v3.2d, v21.2d, v28.2d[1]
+ fmla v4.2d, v20.2d, v29.2d[0]
+ fmla v5.2d, v21.2d, v29.2d[0]
+ fmla v6.2d, v20.2d, v29.2d[1]
+ fmla v7.2d, v21.2d, v29.2d[1]
+ ld1 {v22.2d, v23.2d}, [x12], #32
+ fmla v8.2d, v22.2d, v28.2d[0]
+ fmla v9.2d, v23.2d, v28.2d[0]
+ fmla v10.2d, v22.2d, v28.2d[1]
+ fmla v11.2d, v23.2d, v28.2d[1]
+ fmla v12.2d, v22.2d, v29.2d[0]
+ fmla v13.2d, v23.2d, v29.2d[0]
+ fmla v14.2d, v22.2d, v29.2d[1]
+ fmla v15.2d, v23.2d, v29.2d[1]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_8x4_lib4, .-inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- sdc
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_8X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_8x4_lib4, %function
+inner_scale_ab_8x4_lib4:
+#endif
+
+ ld1 {v28.2d}, [x8]
+
+ fmul v0.2d, v0.2d, v28.2d[0]
+ fmul v1.2d, v1.2d, v28.2d[0]
+ fmul v2.2d, v2.2d, v28.2d[0]
+ fmul v3.2d, v3.2d, v28.2d[0]
+ fmul v4.2d, v4.2d, v28.2d[0]
+ fmul v5.2d, v5.2d, v28.2d[0]
+ fmul v6.2d, v6.2d, v28.2d[0]
+ fmul v7.2d, v7.2d, v28.2d[0]
+ fmul v8.2d, v8.2d, v28.2d[0]
+ fmul v9.2d, v9.2d, v28.2d[0]
+ fmul v10.2d, v10.2d, v28.2d[0]
+ fmul v11.2d, v11.2d, v28.2d[0]
+ fmul v12.2d, v12.2d, v28.2d[0]
+ fmul v13.2d, v13.2d, v28.2d[0]
+ fmul v14.2d, v14.2d, v28.2d[0]
+ fmul v15.2d, v15.2d, v28.2d[0]
+
+ ld1 {v28.2d}, [x9]
+
+ add x12, x10, x11
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+ fmla v0.2d, v24.2d, v28.2d[0]
+ fmla v1.2d, v25.2d, v28.2d[0]
+ fmla v2.2d, v26.2d, v28.2d[0]
+ fmla v3.2d, v27.2d, v28.2d[0]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x10], #64
+ fmla v4.2d, v24.2d, v28.2d[0]
+ fmla v5.2d, v25.2d, v28.2d[0]
+ fmla v6.2d, v26.2d, v28.2d[0]
+ fmla v7.2d, v27.2d, v28.2d[0]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x12], #64
+ fmla v8.2d, v24.2d, v28.2d[0]
+ fmla v9.2d, v25.2d, v28.2d[0]
+ fmla v10.2d, v26.2d, v28.2d[0]
+ fmla v11.2d, v27.2d, v28.2d[0]
+
+ ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x12], #64
+ fmla v12.2d, v24.2d, v28.2d[0]
+ fmla v13.2d, v25.2d, v28.2d[0]
+ fmla v14.2d, v26.2d, v28.2d[0]
+ fmla v15.2d, v27.2d, v28.2d[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- sdd
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_8X4_LIB4
+#else
+ .align 4
+ .type inner_store_8x4_lib4, %function
+inner_store_8x4_lib4:
+#endif
+
+ add x10, x8, x9
+
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x8], #64
+ st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x8], #64
+ st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x10], #64
+ st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x10], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
+// void kernel_dgemm_nt_8x4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
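+//
+// computes D[0:8,0:4] = alpha*A*B^T + beta*C, where A, C and D each span two 4-row lib4
+// panels; the second panel starts 4*sda, 4*sdc and 4*sdd doubles after the first, respectively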
+
+ .align 4
+ .global kernel_dgemm_nt_8x4_lib4
+ .type kernel_dgemm_nt_8x4_lib4, %function
+kernel_dgemm_nt_8x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+ fmov d8, d0
+ fmov d9, d0
+ fmov d10, d0
+ fmov d11, d0
+ fmov d12, d0
+ fmov d13, d0
+ fmov d14, d0
+ fmov d15, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #5 // 32*sda
+ mov x11, x4 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x5 // beta
+ mov x10, x6 // C
+	mov	w11, w7 // sdc
+ lsl w11, w11, #5 // 32*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+ bl inner_scale_ab_8x4_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 0)] // D
+ ldr w9, [sp, #(STACKSIZE + 8)] // sdd
+ lsl w9, w9, #5 // 32*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+ bl inner_store_8x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
+
diff --git a/kernel/armv8a/kernel_sgemm_12x4_lib4.S b/kernel/armv8a/kernel_sgemm_12x4_lib4.S
new file mode 100644
index 0000000..ab66cad
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_12x4_lib4.S
@@ -0,0 +1,512 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- sda
+// x11 <- B
+//
+// output arguments:
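+//
+// operation (accumulators, one 4-float register per result column):
+// v0-v3 <- rows 0-3, v4-v7 <- rows 4-7, v8-v11 <- rows 8-11 of A*B^T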
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_12X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_12x4_lib4, %function
+inner_kernel_gemm_add_nt_12x4_lib4:
+#endif
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x12, x9, x10
+ add x13, x12, x10
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x12, #0]
+ prfm PLDL1KEEP, [x13, #0]
+
+ // preload
+ ld1 {v24.4s, v25.4s}, [x9], #32
+ ld1 {v28.4s, v29.4s}, [x11], #32
+ ld1 {v20.4s, v21.4s}, [x12], #32
+ ld1 {v16.4s, v17.4s}, [x13], #32
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x12, #32]
+ prfm PLDL1KEEP, [x13, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ ld1 {v26.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v27.4s}, [x9], #16
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v30.4s}, [x11], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ ld1 {v31.4s}, [x11], #16
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v22.4s}, [x12], #16
+ fmla v8.4s, v16.4s, v28.4s[0]
+ fmla v9.4s, v16.4s, v28.4s[1]
+ ld1 {v23.4s}, [x12], #16
+ fmla v10.4s, v16.4s, v28.4s[2]
+ fmla v11.4s, v16.4s, v28.4s[3]
+
+ // unroll 1
+ ld1 {v18.4s}, [x13], #16
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ ld1 {v19.4s}, [x13], #16
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v8.4s, v17.4s, v29.4s[0]
+ fmla v9.4s, v17.4s, v29.4s[1]
+ sub w8, w8, #4
+ fmla v10.4s, v17.4s, v29.4s[2]
+ fmla v11.4s, v17.4s, v29.4s[3]
+
+ // unroll 2
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v26.4s, v30.4s[0]
+ fmla v1.4s, v26.4s, v30.4s[1]
+ ld1 {v25.4s}, [x9], #16
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+ ld1 {v28.4s}, [x11], #16
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+ ld1 {v29.4s}, [x11], #16
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+ ld1 {v20.4s}, [x12], #16
+ fmla v8.4s, v18.4s, v30.4s[0]
+ fmla v9.4s, v18.4s, v30.4s[1]
+ ld1 {v21.4s}, [x12], #16
+ fmla v10.4s, v18.4s, v30.4s[2]
+ fmla v11.4s, v18.4s, v30.4s[3]
+
+ // unroll 3
+ ld1 {v16.4s}, [x13], #16
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ ld1 {v17.4s}, [x13], #16
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ cmp w8, #4
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+ fmla v8.4s, v19.4s, v31.4s[0]
+ fmla v9.4s, v19.4s, v31.4s[1]
+ fmla v10.4s, v19.4s, v31.4s[2]
+ fmla v11.4s, v19.4s, v31.4s[3]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ ld1 {v26.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v27.4s}, [x9], #16
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v30.4s}, [x11], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ ld1 {v31.4s}, [x11], #16
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v22.4s}, [x12], #16
+ fmla v8.4s, v16.4s, v28.4s[0]
+ fmla v9.4s, v16.4s, v28.4s[1]
+ ld1 {v23.4s}, [x12], #16
+ fmla v10.4s, v16.4s, v28.4s[2]
+ fmla v11.4s, v16.4s, v28.4s[3]
+
+ // unroll 1
+ ld1 {v18.4s}, [x13], #16
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ ld1 {v19.4s}, [x13], #16
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+// prfm PLDL1KEEP, [x12, #64]
+ fmla v8.4s, v17.4s, v29.4s[0]
+ fmla v9.4s, v17.4s, v29.4s[1]
+ sub w8, w8, #4
+ fmla v10.4s, v17.4s, v29.4s[2]
+ fmla v11.4s, v17.4s, v29.4s[3]
+
+ // unroll 2
+// ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v26.4s, v30.4s[0]
+ fmla v1.4s, v26.4s, v30.4s[1]
+// ld1 {v25.4s}, [x9], #16
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+// ld1 {v28.4s}, [x11], #16
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+// ld1 {v29.4s}, [x11], #16
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+// ld1 {v20.4s}, [x12], #16
+ fmla v8.4s, v18.4s, v30.4s[0]
+ fmla v9.4s, v18.4s, v30.4s[1]
+// ld1 {v21.4s}, [x12], #16
+ fmla v10.4s, v18.4s, v30.4s[2]
+ fmla v11.4s, v18.4s, v30.4s[3]
+
+ // unroll 3
+// ld1 {v16.4s}, [x13], #16
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+// ld1 {v17.4s}, [x13], #16
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ cmp w8, #4
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+ fmla v8.4s, v19.4s, v31.4s[0]
+ fmla v9.4s, v19.4s, v31.4s[1]
+ fmla v10.4s, v19.4s, v31.4s[2]
+ fmla v11.4s, v19.4s, v31.4s[3]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #32
+ sub x12, x12, #32
+ sub x11, x11, #32
+ sub x13, x13, #32
+
+3: // clean1-up loop
+
+ // unroll 0
+
+ ld1 {v28.4s}, [x11], #16
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v20.4s}, [x12], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v16.4s}, [x13], #16
+ fmla v8.4s, v16.4s, v28.4s[0]
+ fmla v9.4s, v16.4s, v28.4s[1]
+ fmla v10.4s, v16.4s, v28.4s[2]
+ fmla v11.4s, v16.4s, v28.4s[3]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_12x4_lib4, .-inner_kernel_gemm_add_nt_12x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- sdc
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_12X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_12x4_lib4, %function
+inner_scale_ab_12x4_lib4:
+#endif
+
+ ld1 {v28.4s}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+ fmul v4.4s, v4.4s, v28.4s[0]
+ fmul v5.4s, v5.4s, v28.4s[0]
+ fmul v6.4s, v6.4s, v28.4s[0]
+ fmul v7.4s, v7.4s, v28.4s[0]
+ fmul v8.4s, v8.4s, v28.4s[0]
+ fmul v9.4s, v9.4s, v28.4s[0]
+ fmul v10.4s, v10.4s, v28.4s[0]
+ fmul v11.4s, v11.4s, v28.4s[0]
+
+ ld1 {v28.4s}, [x9]
+
+ add x12, x10, x11
+ add x13, x12, x11
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v4.4s, v24.4s, v28.4s[0]
+ fmla v5.4s, v25.4s, v28.4s[0]
+ fmla v6.4s, v26.4s, v28.4s[0]
+ fmla v7.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x13], #64
+ fmla v8.4s, v24.4s, v28.4s[0]
+ fmla v9.4s, v25.4s, v28.4s[0]
+ fmla v10.4s, v26.4s, v28.4s[0]
+ fmla v11.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_12x4_lib4, .-inner_scale_ab_12x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- sdd
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_12X4_LIB4
+#else
+ .align 4
+ .type inner_store_12x4_lib4, %function
+inner_store_12x4_lib4:
+#endif
+
+ add x10, x8, x9
+ add x11, x10, x9
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x11], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_12x4_lib4, .-inner_store_12x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
+// void kernel_sgemm_nt_12x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
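+//
+// single-precision variant: D[0:12,0:4] = alpha*A*B^T + beta*C, with A, C and D each
+// spanning three 4-row lib4 panels (panel strides 4*sda, 4*sdc and 4*sdd floats)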
+
+ .align 4
+ .global kernel_sgemm_nt_12x4_lib4
+ .type kernel_sgemm_nt_12x4_lib4, %function
+kernel_sgemm_nt_12x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+ fmov d8, d0
+ fmov d9, d0
+ fmov d10, d0
+ fmov d11, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #4 // 16*sda
+ mov x11, x4 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_12X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_12x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x5 // beta
+ mov x10, x6 // C
+	mov	w11, w7 // sdc
+ lsl w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+ bl inner_scale_ab_12x4_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 0)] // D
+ ldr w9, [sp, #(STACKSIZE + 8)] // sdd
+ lsl w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+ bl inner_store_12x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
+
+
+
diff --git a/kernel/armv8a/kernel_sgemm_16x4_lib4.S b/kernel/armv8a/kernel_sgemm_16x4_lib4.S
new file mode 100644
index 0000000..edc06ac
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_16x4_lib4.S
@@ -0,0 +1,600 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- sda
+// x11 <- B
+//
+// output arguments:
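+//
+// operation (accumulators, one 4-float register per result column):
+// v0-v3 <- rows 0-3, v4-v7 <- rows 4-7, v8-v11 <- rows 8-11, v12-v15 <- rows 12-15 of A*B^T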
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_16X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_16x4_lib4, %function
+inner_kernel_gemm_add_nt_16x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x12, x9, x10
+ add x13, x12, x10
+ add x14, x13, x10
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x12, #0]
+ prfm PLDL1KEEP, [x13, #0]
+ prfm PLDL1KEEP, [x14, #0]
+
+ // preload
+ ldp s24, s25, [x11], #8
+ ldp s26, s27, [x11], #8
+ ldr q16, [x9], #16
+ ldr q17, [x12], #16
+ ldr q18, [x13], #16
+ ldr q19, [x14], #16
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x12, #32]
+ prfm PLDL1KEEP, [x13, #32]
+ prfm PLDL1KEEP, [x14, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ ldp s28, s29, [x11], #8
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s30, s31, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q20, [x9], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ ldr q21, [x12], #16
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q22, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ ldr q23, [x14], #16
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+
+ // unroll 1
+ ldp s24, s25, [x11], #8
+ fmla v0.4s, v20.4s, v28.4s[0]
+ fmla v1.4s, v20.4s, v29.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v20.4s, v30.4s[0]
+ fmla v3.4s, v20.4s, v31.4s[0]
+ ldr q16, [x9], #16
+ fmla v4.4s, v21.4s, v28.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[0]
+ ldr q17, [x12], #16
+ fmla v6.4s, v21.4s, v30.4s[0]
+ fmla v7.4s, v21.4s, v31.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v22.4s, v28.4s[0]
+ fmla v9.4s, v22.4s, v29.4s[0]
+ ldr q19, [x14], #16
+ fmla v10.4s, v22.4s, v30.4s[0]
+ fmla v11.4s, v22.4s, v31.4s[0]
+ prfm PLDL1KEEP, [x13, #32]
+ fmla v12.4s, v23.4s, v28.4s[0]
+ fmla v13.4s, v23.4s, v29.4s[0]
+ prfm PLDL1KEEP, [x14, #32]
+ fmla v14.4s, v23.4s, v30.4s[0]
+ fmla v15.4s, v23.4s, v31.4s[0]
+
+ // unroll 2
+ ldp s28, s29, [x11], #8
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s30, s31, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q20, [x9], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ ldr q21, [x12], #16
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q22, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ ldr q23, [x14], #16
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+
+ // unroll 3
+ ldp s24, s25, [x11], #8
+ fmla v0.4s, v20.4s, v28.4s[0]
+ fmla v1.4s, v20.4s, v29.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v20.4s, v30.4s[0]
+ fmla v3.4s, v20.4s, v31.4s[0]
+ ldr q16, [x9], #16
+ fmla v4.4s, v21.4s, v28.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[0]
+ ldr q17, [x12], #16
+ fmla v6.4s, v21.4s, v30.4s[0]
+ fmla v7.4s, v21.4s, v31.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v22.4s, v28.4s[0]
+ fmla v9.4s, v22.4s, v29.4s[0]
+ ldr q19, [x14], #16
+ fmla v10.4s, v22.4s, v30.4s[0]
+ fmla v11.4s, v22.4s, v31.4s[0]
+ sub w8, w8, #4
+ fmla v12.4s, v23.4s, v28.4s[0]
+ fmla v13.4s, v23.4s, v29.4s[0]
+ cmp w8, #4
+ fmla v14.4s, v23.4s, v30.4s[0]
+ fmla v15.4s, v23.4s, v31.4s[0]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+
+ // unroll 0
+ ldp s28, s29, [x11], #8
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s30, s31, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q20, [x9], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ ldr q21, [x12], #16
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q22, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ ldr q23, [x14], #16
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+
+ // unroll 1
+ ldp s24, s25, [x11], #8
+ fmla v0.4s, v20.4s, v28.4s[0]
+ fmla v1.4s, v20.4s, v29.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v20.4s, v30.4s[0]
+ fmla v3.4s, v20.4s, v31.4s[0]
+ ldr q16, [x9], #16
+ fmla v4.4s, v21.4s, v28.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[0]
+ ldr q17, [x12], #16
+ fmla v6.4s, v21.4s, v30.4s[0]
+ fmla v7.4s, v21.4s, v31.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v22.4s, v28.4s[0]
+ fmla v9.4s, v22.4s, v29.4s[0]
+ ldr q19, [x14], #16
+ fmla v10.4s, v22.4s, v30.4s[0]
+ fmla v11.4s, v22.4s, v31.4s[0]
+// prfm PLDL1KEEP, [x12, #64]
+ fmla v12.4s, v23.4s, v28.4s[0]
+ fmla v13.4s, v23.4s, v29.4s[0]
+// prfm PLDL1KEEP, [x13, #64]
+ fmla v14.4s, v23.4s, v30.4s[0]
+ fmla v15.4s, v23.4s, v31.4s[0]
+
+ // unroll 2
+ ldp s28, s29, [x11], #8
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s30, s31, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q20, [x9], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ ldr q21, [x12], #16
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q22, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ ldr q23, [x14], #16
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+// prfm PLDL1KEEP, [x14, #64]
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+
+ // unroll 3
+ ldp s24, s25, [x11], #8
+ fmla v0.4s, v20.4s, v28.4s[0]
+ fmla v1.4s, v20.4s, v29.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v20.4s, v30.4s[0]
+ fmla v3.4s, v20.4s, v31.4s[0]
+ ldr q16, [x9], #16
+ fmla v4.4s, v21.4s, v28.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[0]
+ ldr q17, [x12], #16
+ fmla v6.4s, v21.4s, v30.4s[0]
+ fmla v7.4s, v21.4s, v31.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v22.4s, v28.4s[0]
+ fmla v9.4s, v22.4s, v29.4s[0]
+ ldr q19, [x14], #16
+ fmla v10.4s, v22.4s, v30.4s[0]
+ fmla v11.4s, v22.4s, v31.4s[0]
+// sub w8, w8, #4
+ fmla v12.4s, v23.4s, v28.4s[0]
+ fmla v13.4s, v23.4s, v29.4s[0]
+// cmp w8, #4
+ fmla v14.4s, v23.4s, v30.4s[0]
+ fmla v15.4s, v23.4s, v31.4s[0]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #16
+ sub x11, x11, #16
+ sub x12, x12, #16
+ sub x13, x13, #16
+ sub x14, x14, #16
+
+3: // clean1-up loop
+
+ // unroll 0
+ // TODO
+ ldp s24, s25, [x11], #8
+ ldr q16, [x9], #16
+ fmla v0.4s, v16.4s, v24.4s[0]
+ fmla v1.4s, v16.4s, v25.4s[0]
+ ldp s26, s27, [x11], #8
+ fmla v2.4s, v16.4s, v26.4s[0]
+ fmla v3.4s, v16.4s, v27.4s[0]
+ ldr q17, [x12], #16
+ fmla v4.4s, v17.4s, v24.4s[0]
+ fmla v5.4s, v17.4s, v25.4s[0]
+ fmla v6.4s, v17.4s, v26.4s[0]
+ fmla v7.4s, v17.4s, v27.4s[0]
+ ldr q18, [x13], #16
+ fmla v8.4s, v18.4s, v24.4s[0]
+ fmla v9.4s, v18.4s, v25.4s[0]
+ fmla v10.4s, v18.4s, v26.4s[0]
+ fmla v11.4s, v18.4s, v27.4s[0]
+ ldr q19, [x14], #16
+ fmla v12.4s, v19.4s, v24.4s[0]
+ fmla v13.4s, v19.4s, v25.4s[0]
+ fmla v14.4s, v19.4s, v26.4s[0]
+ fmla v15.4s, v19.4s, v27.4s[0]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_16x4_lib4, .-inner_kernel_gemm_add_nt_16x4_lib4
+#endif
+
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- sdc
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_16X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_16x4_lib4, %function
+inner_scale_ab_16x4_lib4:
+#endif
+
+ ld1 {v28.4s}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+ fmul v4.4s, v4.4s, v28.4s[0]
+ fmul v5.4s, v5.4s, v28.4s[0]
+ fmul v6.4s, v6.4s, v28.4s[0]
+ fmul v7.4s, v7.4s, v28.4s[0]
+ fmul v8.4s, v8.4s, v28.4s[0]
+ fmul v9.4s, v9.4s, v28.4s[0]
+ fmul v10.4s, v10.4s, v28.4s[0]
+ fmul v11.4s, v11.4s, v28.4s[0]
+ fmul v12.4s, v12.4s, v28.4s[0]
+ fmul v13.4s, v13.4s, v28.4s[0]
+ fmul v14.4s, v14.4s, v28.4s[0]
+ fmul v15.4s, v15.4s, v28.4s[0]
+
+ ld1 {v28.4s}, [x9]
+
+ add x12, x10, x11
+ add x13, x12, x11
+ add x14, x13, x11
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v4.4s, v24.4s, v28.4s[0]
+ fmla v5.4s, v25.4s, v28.4s[0]
+ fmla v6.4s, v26.4s, v28.4s[0]
+ fmla v7.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x13], #64
+ fmla v8.4s, v24.4s, v28.4s[0]
+ fmla v9.4s, v25.4s, v28.4s[0]
+ fmla v10.4s, v26.4s, v28.4s[0]
+ fmla v11.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x14], #64
+ fmla v12.4s, v24.4s, v28.4s[0]
+ fmla v13.4s, v25.4s, v28.4s[0]
+ fmla v14.4s, v26.4s, v28.4s[0]
+ fmla v15.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_16x4_lib4, .-inner_scale_ab_16x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- sdd
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_16X4_LIB4
+#else
+ .align 4
+ .type inner_store_16x4_lib4, %function
+inner_store_16x4_lib4:
+#endif
+
+ add x10, x8, x9
+ add x11, x10, x9
+ add x12, x11, x9
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x11], #64
+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x12], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_16x4_lib4, .-inner_store_16x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
+// void kernel_sgemm_nt_16x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
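+//
+// D[0:16,0:4] = alpha*A*B^T + beta*C, with A, C and D each spanning four 4-row lib4
+// panels (panel strides 4*sda, 4*sdc and 4*sdd floats)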
+
+ .align 4
+ .global kernel_sgemm_nt_16x4_lib4
+ .type kernel_sgemm_nt_16x4_lib4, %function
+kernel_sgemm_nt_16x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+ fmov d8, d0
+ fmov d9, d0
+ fmov d10, d0
+ fmov d11, d0
+ fmov d12, d0
+ fmov d13, d0
+ fmov d14, d0
+ fmov d15, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #4 // 16*sda
+ mov x11, x4 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_16x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x5 // beta
+ mov x10, x6 // C
+	mov	w11, w7 // sdc
+ lsl w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB4
+#else
+ bl inner_scale_ab_16x4_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 0)] // D
+ ldr w9, [sp, #(STACKSIZE + 8)] // sdd
+ lsl w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB4
+#else
+ bl inner_store_16x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
+
diff --git a/kernel/armv8a/kernel_sgemm_4x4_lib4.S b/kernel/armv8a/kernel_sgemm_4x4_lib4.S
new file mode 100644
index 0000000..6d3850d
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_4x4_lib4.S
@@ -0,0 +1,354 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+	sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- B
+//
+// output arguments:
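+//
+// operation (accumulators): v0-v3 <- the four columns of the 4x4 product A*B^T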
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_4x4_lib4, %function
+inner_kernel_gemm_add_nt_4x4_lib4:
+#endif
+
+// TODO more aggressive preload of A !!!
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ // prefetch
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x10, #0]
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // preload
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+
+ // prefetch
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x10, #32]
+
+ // main loop
+1:
+
+
+ // unroll 0
+ fmla v0.4s, v24.4s, v28.4s[0]
+ ld1 {v26.2d, v27.2d}, [x9], #32
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v30.2d, v31.2d}, [x10], #32
+ fmla v2.4s, v24.4s, v28.4s[2]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ prfm PLDL1KEEP, [x10, #64]
+
+ // unroll 1
+ fmla v0.4s, v25.4s, v29.4s[0]
+ sub w8, w8, #4
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+
+ // unroll 2
+ fmla v0.4s, v26.4s, v30.4s[0]
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ fmla v1.4s, v26.4s, v30.4s[1]
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+
+ cmp w8, #4
+ bgt 1b
+
+ sub x9, x9, #32
+ sub x10, x10, #32
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+
+ // unroll 1
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+
+ // unroll 2
+ ld1 {v24.2d, v25.2d}, [x9], #32
+ ld1 {v28.2d, v29.2d}, [x10], #32
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+
+ sub w8, w8, #4
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+3: // clean1-up loop
+
+ // unroll 0
+ ld1 {v24.2d}, [x9], #16
+ ld1 {v28.2d}, [x10], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_4x4_lib4, .-inner_kernel_gemm_add_nt_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_4x4_lib4, %function
+inner_scale_ab_4x4_lib4:
+#endif
+
+ ld1 {v28.2d}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+
+ ld1 {v28.2d}, [x9]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .align 4
+ .type inner_store_4x4_lib4, %function
+inner_store_4x4_lib4:
+#endif
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 x3 x4 x5 x6
+// void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
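+//
+// single-precision 4x4 kernel: D = alpha*A*B^T + beta*C, with each operand stored as a
+// single 4-row lib4 panel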
+
+ .align 4
+ .global kernel_sgemm_nt_4x4_lib4
+ .type kernel_sgemm_nt_4x4_lib4, %function
+kernel_sgemm_nt_4x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+
+
+
+	// call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov x10, x3 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_4x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x4 // beta
+ mov x10, x5 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+ bl inner_scale_ab_4x4_lib4
+#endif
+
+
+
+ // store n
+ mov x8, x6
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+ bl inner_store_4x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
diff --git a/kernel/armv8a/kernel_sgemm_8x4_lib4.S b/kernel/armv8a/kernel_sgemm_8x4_lib4.S
new file mode 100644
index 0000000..016af72
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_8x4_lib4.S
@@ -0,0 +1,433 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- sda
+// x11 <- B
+//
+// output arguments:
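+//
+// operation (accumulators, one 4-float register per result column):
+// v0-v3 <- rows 0-3, v4-v7 <- rows 4-7 of A*B^T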
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_8x4_lib4, %function
+inner_kernel_gemm_add_nt_8x4_lib4:
+#endif
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x12, x9, x10
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x12, #0]
+
+ // preload
+ ld1 {v24.4s, v25.4s}, [x9], #32
+ ld1 {v28.4s, v29.4s}, [x11], #32
+ ld1 {v20.4s, v21.4s}, [x12], #32
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x12, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ fmla v0.4s, v24.4s, v28.4s[0]
+ ld1 {v26.4s, v27.4s}, [x9], #32
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v30.4s, v31.4s}, [x11], #32
+ fmla v2.4s, v24.4s, v28.4s[2]
+ ld1 {v22.4s, v23.4s}, [x12], #32
+ fmla v3.4s, v24.4s, v28.4s[3]
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v4.4s, v20.4s, v28.4s[0]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ prfm PLDL1KEEP, [x12, #64]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ sub w8, w8, #4
+
+ // unroll 1
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+ cmp w8, #4
+
+ // unroll 2
+ fmla v0.4s, v26.4s, v30.4s[0]
+ ld1 {v24.4s, v25.4s}, [x9], #32
+ fmla v1.4s, v26.4s, v30.4s[1]
+ ld1 {v28.4s, v29.4s}, [x11], #32
+ fmla v2.4s, v26.4s, v30.4s[2]
+ ld1 {v20.4s, v21.4s}, [x12], #32
+ fmla v3.4s, v26.4s, v30.4s[3]
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ fmla v0.4s, v24.4s, v28.4s[0]
+ ld1 {v26.4s, v27.4s}, [x9], #32
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v30.4s, v31.4s}, [x11], #32
+ fmla v2.4s, v24.4s, v28.4s[2]
+ ld1 {v22.4s, v23.4s}, [x12], #32
+ fmla v3.4s, v24.4s, v28.4s[3]
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v4.4s, v20.4s, v28.4s[0]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v5.4s, v20.4s, v28.4s[1]
+// prfm PLDL1KEEP, [x12, #64]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ sub w8, w8, #4
+
+ // unroll 1
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+// cmp w8, #4
+
+ // unroll 2
+ fmla v0.4s, v26.4s, v30.4s[0]
+// ld1 {v24.4s, v25.4s}, [x9], #32
+ fmla v1.4s, v26.4s, v30.4s[1]
+// ld1 {v28.4s, v29.4s}, [x11], #32
+ fmla v2.4s, v26.4s, v30.4s[2]
+// ld1 {v20.4s, v21.4s}, [x12], #32
+ fmla v3.4s, v26.4s, v30.4s[3]
+// ld1 {v16.4s, v17.4s}, [x13], #32
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #32
+ sub x12, x12, #32
+ sub x11, x11, #32
+
+3: // clean1-up loop
+
+ // unroll 0
+
+ ld1 {v28.4s}, [x11], #16
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v20.4s}, [x12], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_8x4_lib4, .-inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- sdc
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_8X4_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_8x4_lib4, %function
+inner_scale_ab_8x4_lib4:
+#endif
+
+ ld1 {v28.4s}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+ fmul v4.4s, v4.4s, v28.4s[0]
+ fmul v5.4s, v5.4s, v28.4s[0]
+ fmul v6.4s, v6.4s, v28.4s[0]
+ fmul v7.4s, v7.4s, v28.4s[0]
+
+ ld1 {v28.4s}, [x9]
+
+ add x12, x10, x11
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v4.4s, v24.4s, v28.4s[0]
+ fmla v5.4s, v25.4s, v28.4s[0]
+ fmla v6.4s, v26.4s, v28.4s[0]
+ fmla v7.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- sdd
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_8X4_LIB4
+#else
+ .align 4
+ .type inner_store_8x4_lib4, %function
+inner_store_8x4_lib4:
+#endif
+
+ add x10, x8, x9
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
+// void kernel_sgemm_nt_8x4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+
+ .align 4
+ .global kernel_sgemm_nt_8x4_lib4
+ .type kernel_sgemm_nt_8x4_lib4, %function
+kernel_sgemm_nt_8x4_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #4 // 16*sda
+ mov x11, x4 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_8x4_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x5 // beta
+ mov x10, x6 // C
+ mov w11, w7 // sdc
+ lsl w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+ bl inner_scale_ab_8x4_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 0)] // D
+ ldr w9, [sp, #(STACKSIZE + 8)] // sdd
+ lsl w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+ bl inner_store_8x4_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
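+// Reference sketch (illustration only, kept as a comment so the file still
+// assembles): a plain-C model of what kernel_sgemm_nt_8x4_lib4 computes on
+// lib4 panel-major data, i.e. D = alpha*A*B^T + beta*C on an 8x4 block.
+// The helper name and the ps constant are assumptions for illustration;
+// alpha, beta, A, B, C, D, sda, sdc, sdd follow the prototype above.
+//
+//	static void ref_sgemm_nt_8x4_lib4(int kmax, float *alpha, float *A,
+//			int sda, float *B, float *beta, float *C, int sdc,
+//			float *D, int sdd)
+//		{
+//		const int ps = 4; // panel height
+//		for(int i=0; i<8; i++)
+//			for(int j=0; j<4; j++)
+//				{
+//				float acc = 0.0f;
+//				for(int k=0; k<kmax; k++)
+//					acc += A[(i/ps)*ps*sda + i%ps + k*ps] * B[j + k*ps];
+//				D[(i/ps)*ps*sdd + i%ps + j*ps] = alpha[0]*acc
+//					+ beta[0]*C[(i/ps)*ps*sdc + i%ps + j*ps];
+//				}
+//		}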
+
+
diff --git a/kernel/armv8a/kernel_sgemm_8x8_lib4.S b/kernel/armv8a/kernel_sgemm_8x8_lib4.S
new file mode 100644
index 0000000..6c8c090
--- /dev/null
+++ b/kernel/armv8a/kernel_sgemm_8x8_lib4.S
@@ -0,0 +1,565 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#define STACKSIZE 11*16
+#define PROLOGUE \
+ sub sp, sp, #(11 * 16); \
+ stp d8, d9, [sp, #(0 * 16)]; \
+ stp d10, d11, [sp, #(1 * 16)]; \
+ stp d12, d13, [sp, #(2 * 16)]; \
+ stp d14, d15, [sp, #(3 * 16)]; \
+ stp x18, x19, [sp, #(4 * 16)]; \
+ stp x20, x21, [sp, #(5 * 16)]; \
+ stp x22, x23, [sp, #(6 * 16)]; \
+ stp x24, x25, [sp, #(7 * 16)]; \
+ stp x26, x27, [sp, #(8 * 16)]; \
+ stp x28, x29, [sp, #(9 * 16)]; \
+ str x30, [sp, #(10 * 16)];
+#define EPILOGUE \
+ ldp d8, d9, [sp, #(0 * 16)]; \
+ ldp d10, d11, [sp, #(1 * 16)]; \
+ ldp d12, d13, [sp, #(2 * 16)]; \
+ ldp d14, d15, [sp, #(3 * 16)]; \
+ ldp x18, x19, [sp, #(4 * 16)]; \
+ ldp x20, x21, [sp, #(5 * 16)]; \
+ ldp x22, x23, [sp, #(6 * 16)]; \
+ ldp x24, x25, [sp, #(7 * 16)]; \
+ ldp x26, x27, [sp, #(8 * 16)]; \
+ ldp x28, x29, [sp, #(9 * 16)]; \
+ ldr x30, [sp, #(10 * 16)]; \
+ add sp, sp, #(11 * 16);
+
+
+
+
+
+ .text
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// w8 <- k
+// x9 <- A
+// x10 <- sda
+// x11 <- B
+// x12 <- sdb
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB4
+#else
+ .align 4
+ .type inner_kernel_gemm_add_nt_8x8_lib4, %function
+inner_kernel_gemm_add_nt_8x8_lib4:
+#endif
+
+ // early return
+ cmp w8, #0
+ ble 2f // return
+
+ add x13, x9, x10
+ add x14, x11, x12
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #0]
+ prfm PLDL1KEEP, [x9, #0]
+ prfm PLDL1KEEP, [x13, #0]
+ prfm PLDL1KEEP, [x14, #0]
+
+ // preload
+ ld1 {v24.4s, v25.4s}, [x9], #32
+ ld1 {v28.4s, v29.4s}, [x11], #32
+ ld1 {v20.4s, v21.4s}, [x13], #32
+ ld1 {v16.4s, v17.4s}, [x14], #32
+
+ cmp w8, #4
+ ble 0f // consider clean up loop
+
+ // prefetch
+ prfm PLDL1KEEP, [x11, #32]
+ prfm PLDL1KEEP, [x9, #32]
+ prfm PLDL1KEEP, [x13, #32]
+ prfm PLDL1KEEP, [x14, #32]
+
+ // main loop
+1:
+
+ // unroll 0
+ ld1 {v26.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v27.4s}, [x9], #16
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v30.4s}, [x11], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ ld1 {v31.4s}, [x11], #16
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v22.4s}, [x13], #16
+ fmla v8.4s, v24.4s, v16.4s[0]
+ fmla v9.4s, v24.4s, v16.4s[1]
+ ld1 {v23.4s}, [x13], #16
+ fmla v10.4s, v24.4s, v16.4s[2]
+ fmla v11.4s, v24.4s, v16.4s[3]
+ ld1 {v18.4s}, [x14], #16
+ fmla v12.4s, v20.4s, v16.4s[0]
+ fmla v13.4s, v20.4s, v16.4s[1]
+ ld1 {v19.4s}, [x14], #16
+ fmla v14.4s, v20.4s, v16.4s[2]
+ fmla v15.4s, v20.4s, v16.4s[3]
+
+ // unroll 1
+ prfm PLDL1KEEP, [x11, #64]
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+ prfm PLDL1KEEP, [x9, #64]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+ prfm PLDL1KEEP, [x13, #64]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+ prfm PLDL1KEEP, [x14, #64]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+ sub w8, w8, #4
+ fmla v8.4s, v25.4s, v17.4s[0]
+ fmla v9.4s, v25.4s, v17.4s[1]
+ fmla v10.4s, v25.4s, v17.4s[2]
+ fmla v11.4s, v25.4s, v17.4s[3]
+ fmla v12.4s, v21.4s, v17.4s[0]
+ fmla v13.4s, v21.4s, v17.4s[1]
+ cmp w8, #4
+ fmla v14.4s, v21.4s, v17.4s[2]
+ fmla v15.4s, v21.4s, v17.4s[3]
+
+ // unroll 2
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v26.4s, v30.4s[0]
+ fmla v1.4s, v26.4s, v30.4s[1]
+ ld1 {v25.4s}, [x9], #16
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+ ld1 {v28.4s}, [x11], #16
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+ ld1 {v29.4s}, [x11], #16
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+ ld1 {v20.4s}, [x13], #16
+ fmla v8.4s, v26.4s, v18.4s[0]
+ fmla v9.4s, v26.4s, v18.4s[1]
+ ld1 {v21.4s}, [x13], #16
+ fmla v10.4s, v26.4s, v18.4s[2]
+ fmla v11.4s, v26.4s, v18.4s[3]
+ ld1 {v16.4s}, [x14], #16
+ fmla v12.4s, v22.4s, v18.4s[0]
+ fmla v13.4s, v22.4s, v18.4s[1]
+ ld1 {v17.4s}, [x14], #16
+ fmla v14.4s, v22.4s, v18.4s[2]
+ fmla v15.4s, v22.4s, v18.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+ fmla v8.4s, v27.4s, v19.4s[0]
+ fmla v9.4s, v27.4s, v19.4s[1]
+ fmla v10.4s, v27.4s, v19.4s[2]
+ fmla v11.4s, v27.4s, v19.4s[3]
+ fmla v12.4s, v23.4s, v19.4s[0]
+ fmla v13.4s, v23.4s, v19.4s[1]
+ fmla v14.4s, v23.4s, v19.4s[2]
+ fmla v15.4s, v23.4s, v19.4s[3]
+
+ bgt 1b
+
+0:
+
+ cmp w8, #3
+ ble 4f
+
+ // unroll 0
+ ld1 {v26.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ ld1 {v27.4s}, [x9], #16
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v30.4s}, [x11], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ ld1 {v31.4s}, [x11], #16
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v22.4s}, [x13], #16
+ fmla v8.4s, v24.4s, v16.4s[0]
+ fmla v9.4s, v24.4s, v16.4s[1]
+ ld1 {v23.4s}, [x13], #16
+ fmla v10.4s, v24.4s, v16.4s[2]
+ fmla v11.4s, v24.4s, v16.4s[3]
+ ld1 {v18.4s}, [x14], #16
+ fmla v12.4s, v20.4s, v16.4s[0]
+ fmla v13.4s, v20.4s, v16.4s[1]
+ ld1 {v19.4s}, [x14], #16
+ fmla v14.4s, v20.4s, v16.4s[2]
+ fmla v15.4s, v20.4s, v16.4s[3]
+
+ // unroll 1
+// prfm PLDL1KEEP, [x11, #64]
+ fmla v0.4s, v25.4s, v29.4s[0]
+ fmla v1.4s, v25.4s, v29.4s[1]
+// prfm PLDL1KEEP, [x9, #64]
+ fmla v2.4s, v25.4s, v29.4s[2]
+ fmla v3.4s, v25.4s, v29.4s[3]
+// prfm PLDL1KEEP, [x13, #64]
+ fmla v4.4s, v21.4s, v29.4s[0]
+ fmla v5.4s, v21.4s, v29.4s[1]
+// prfm PLDL1KEEP, [x14, #64]
+ fmla v6.4s, v21.4s, v29.4s[2]
+ fmla v7.4s, v21.4s, v29.4s[3]
+ sub w8, w8, #4
+ fmla v8.4s, v25.4s, v17.4s[0]
+ fmla v9.4s, v25.4s, v17.4s[1]
+ fmla v10.4s, v25.4s, v17.4s[2]
+ fmla v11.4s, v25.4s, v17.4s[3]
+ fmla v12.4s, v21.4s, v17.4s[0]
+ fmla v13.4s, v21.4s, v17.4s[1]
+ cmp w8, #4
+ fmla v14.4s, v21.4s, v17.4s[2]
+ fmla v15.4s, v21.4s, v17.4s[3]
+
+ // unroll 2
+// ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v26.4s, v30.4s[0]
+ fmla v1.4s, v26.4s, v30.4s[1]
+// ld1 {v25.4s}, [x9], #16
+ fmla v2.4s, v26.4s, v30.4s[2]
+ fmla v3.4s, v26.4s, v30.4s[3]
+// ld1 {v28.4s}, [x11], #16
+ fmla v4.4s, v22.4s, v30.4s[0]
+ fmla v5.4s, v22.4s, v30.4s[1]
+// ld1 {v29.4s}, [x11], #16
+ fmla v6.4s, v22.4s, v30.4s[2]
+ fmla v7.4s, v22.4s, v30.4s[3]
+// ld1 {v20.4s}, [x13], #16
+ fmla v8.4s, v26.4s, v18.4s[0]
+ fmla v9.4s, v26.4s, v18.4s[1]
+// ld1 {v21.4s}, [x13], #16
+ fmla v10.4s, v26.4s, v18.4s[2]
+ fmla v11.4s, v26.4s, v18.4s[3]
+// ld1 {v16.4s}, [x14], #16
+ fmla v12.4s, v22.4s, v18.4s[0]
+ fmla v13.4s, v22.4s, v18.4s[1]
+// ld1 {v17.4s}, [x14], #16
+ fmla v14.4s, v22.4s, v18.4s[2]
+ fmla v15.4s, v22.4s, v18.4s[3]
+
+ // unroll 3
+ fmla v0.4s, v27.4s, v31.4s[0]
+ fmla v1.4s, v27.4s, v31.4s[1]
+ fmla v2.4s, v27.4s, v31.4s[2]
+ fmla v3.4s, v27.4s, v31.4s[3]
+ fmla v4.4s, v23.4s, v31.4s[0]
+ fmla v5.4s, v23.4s, v31.4s[1]
+ fmla v6.4s, v23.4s, v31.4s[2]
+ fmla v7.4s, v23.4s, v31.4s[3]
+ fmla v8.4s, v27.4s, v19.4s[0]
+ fmla v9.4s, v27.4s, v19.4s[1]
+ fmla v10.4s, v27.4s, v19.4s[2]
+ fmla v11.4s, v27.4s, v19.4s[3]
+ fmla v12.4s, v23.4s, v19.4s[0]
+ fmla v13.4s, v23.4s, v19.4s[1]
+ fmla v14.4s, v23.4s, v19.4s[2]
+ fmla v15.4s, v23.4s, v19.4s[3]
+
+ b 2f // return
+
+4: // consider clean1-up loop
+
+ cmp w8, #0
+ ble 2f // return
+
+ sub x9, x9, #32
+ sub x13, x13, #32
+ sub x11, x11, #32
+ sub x14, x14, #32
+
+3: // clean1-up loop
+
+ // unroll 0
+
+ ld1 {v28.4s}, [x11], #16
+ ld1 {v24.4s}, [x9], #16
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v24.4s, v28.4s[1]
+ fmla v2.4s, v24.4s, v28.4s[2]
+ fmla v3.4s, v24.4s, v28.4s[3]
+ ld1 {v20.4s}, [x13], #16
+ fmla v4.4s, v20.4s, v28.4s[0]
+ fmla v5.4s, v20.4s, v28.4s[1]
+ fmla v6.4s, v20.4s, v28.4s[2]
+ fmla v7.4s, v20.4s, v28.4s[3]
+ ld1 {v16.4s}, [x14], #16
+ fmla v8.4s, v24.4s, v16.4s[0]
+ fmla v9.4s, v24.4s, v16.4s[1]
+ fmla v10.4s, v24.4s, v16.4s[2]
+ fmla v11.4s, v24.4s, v16.4s[3]
+ fmla v12.4s, v20.4s, v16.4s[0]
+ fmla v13.4s, v20.4s, v16.4s[1]
+ fmla v14.4s, v20.4s, v16.4s[2]
+ fmla v15.4s, v20.4s, v16.4s[3]
+
+ sub w8, w8, #1
+ cmp w8, #0
+ bgt 3b
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_kernel_gemm_add_nt_8x8_lib4, .-inner_kernel_gemm_add_nt_8x8_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- alpha
+// x9 <- beta
+// x10 <- C
+// x11 <- sdc
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_SCALE_AB_8X8_LIB4
+#else
+ .align 4
+ .type inner_scale_ab_8x8_lib4, %function
+inner_scale_ab_8x8_lib4:
+#endif
+
+ ld1 {v28.4s}, [x8]
+
+ fmul v0.4s, v0.4s, v28.4s[0]
+ fmul v1.4s, v1.4s, v28.4s[0]
+ fmul v2.4s, v2.4s, v28.4s[0]
+ fmul v3.4s, v3.4s, v28.4s[0]
+ fmul v4.4s, v4.4s, v28.4s[0]
+ fmul v5.4s, v5.4s, v28.4s[0]
+ fmul v6.4s, v6.4s, v28.4s[0]
+ fmul v7.4s, v7.4s, v28.4s[0]
+ fmul v8.4s, v8.4s, v28.4s[0]
+ fmul v9.4s, v9.4s, v28.4s[0]
+ fmul v10.4s, v10.4s, v28.4s[0]
+ fmul v11.4s, v11.4s, v28.4s[0]
+ fmul v12.4s, v12.4s, v28.4s[0]
+ fmul v13.4s, v13.4s, v28.4s[0]
+ fmul v14.4s, v14.4s, v28.4s[0]
+ fmul v15.4s, v15.4s, v28.4s[0]
+
+ ld1 {v28.4s}, [x9]
+
+ add x12, x10, x11
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v0.4s, v24.4s, v28.4s[0]
+ fmla v1.4s, v25.4s, v28.4s[0]
+ fmla v2.4s, v26.4s, v28.4s[0]
+ fmla v3.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v4.4s, v24.4s, v28.4s[0]
+ fmla v5.4s, v25.4s, v28.4s[0]
+ fmla v6.4s, v26.4s, v28.4s[0]
+ fmla v7.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
+ fmla v8.4s, v24.4s, v28.4s[0]
+ fmla v9.4s, v25.4s, v28.4s[0]
+ fmla v10.4s, v26.4s, v28.4s[0]
+ fmla v11.4s, v27.4s, v28.4s[0]
+
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x12], #64
+ fmla v12.4s, v24.4s, v28.4s[0]
+ fmla v13.4s, v25.4s, v28.4s[0]
+ fmla v14.4s, v26.4s, v28.4s[0]
+ fmla v15.4s, v27.4s, v28.4s[0]
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_scale_ab_8x8_lib4, .-inner_scale_ab_8x8_lib4
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// x8 <- D
+// x9 <- sdd
+//
+// output arguments:
+
+#if MACRO_LEVEL>=2
+ .macro INNER_STORE_8X8_LIB4
+#else
+ .align 4
+ .type inner_store_8x8_lib4, %function
+inner_store_8x8_lib4:
+#endif
+
+ add x10, x8, x9
+
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x8], #64
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x8], #64
+ st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], #64
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+ .size inner_store_8x8_lib4, .-inner_store_8x8_lib4
+#endif
+
+
+
+
+
+// w0 x1 x2 w3 x4 w5 x6 x7 sp+0 sp+8 sp+16
+// void kernel_sgemm_nt_8x8_lib4(int kmax, float *alpha, float *A, int sda, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd)
+
+ .align 4
+ .global kernel_sgemm_nt_8x8_lib4
+ .type kernel_sgemm_nt_8x8_lib4, %function
+kernel_sgemm_nt_8x8_lib4:
+
+
+
+ PROLOGUE
+
+
+
+ // TODO zero the entire 128-bit register ???
+ fmov d0, xzr
+ fmov d1, d0
+ fmov d2, d0
+ fmov d3, d0
+ fmov d4, d0
+ fmov d5, d0
+ fmov d6, d0
+ fmov d7, d0
+ fmov d8, d0
+ fmov d9, d0
+ fmov d10, d0
+ fmov d11, d0
+ fmov d12, d0
+ fmov d13, d0
+ fmov d14, d0
+ fmov d15, d0
+
+
+
+ // call inner kernel gemm nt
+ mov w8, w0 // kmax
+ mov x9, x2 // A
+ mov w10, w3 // sda
+ lsl w10, w10, #4 // 16*sda
+ mov x11, x4 // B
+ mov w12, w5 // sdb
+ lsl w12, w12, #4 // 16*sdb
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB4
+#else
+ bl inner_kernel_gemm_add_nt_8x8_lib4
+#endif
+
+
+
+ // call inner blend for generic alpha and beta
+ mov x8, x1 // alpha
+ mov x9, x6 // beta
+ mov x10, x7 // C
+ ldr w11, [sp, #(STACKSIZE + 0)] // sdc
+ lsl w11, w11, #4 // 16*sdc
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB4
+#else
+ bl inner_scale_ab_8x8_lib4
+#endif
+
+
+
+ // store n
+ ldr x8, [sp, #(STACKSIZE + 8)] // D
+ ldr w9, [sp, #(STACKSIZE + 16)] // sdd
+ lsl w9, w9, #4 // 16*sdd
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB4
+#else
+ bl inner_store_8x8_lib4
+#endif
+
+
+
+ EPILOGUE
+
+ mov x0, #0
+
+ ret
+
+
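+// Layout note (interpretation of the kernel above, illustration only): the
+// 8x8 accumulator lives in v0-v15 as four 4x4 blocks,
+//
+//              cols 0-3     cols 4-7
+//   rows 0-3   v0  - v3     v8  - v11
+//   rows 4-7   v4  - v7     v12 - v15
+//
+// with each register holding one 4-float column segment, which is the
+// order inner_store_8x8_lib4 writes out. A plain-C model of that store
+// stage (acc and ps are illustrative names; D and sdd follow the prototype
+// above):
+//
+//	const int ps = 4;
+//	for(int j=0; j<8; j++)
+//		for(int i=0; i<8; i++)
+//			D[(i/ps)*ps*sdd + i%ps + j*ps] = acc[i][j];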
+
+
diff --git a/kernel/avx/Makefile b/kernel/avx/Makefile
new file mode 100644
index 0000000..f260086
--- /dev/null
+++ b/kernel/avx/Makefile
@@ -0,0 +1,54 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_diag_lib8.o kernel_sgecp_lib8.o kernel_sgetr_lib8.o kernel_sgead_lib8.o kernel_sgesc_lib8.o kernel_sgemv_8_lib8.o kernel_sgemv_4_lib8.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += kernel_dgemm_8x4_lib4.o kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_12_lib4.o kernel_dgemv_8_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_6_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o kernel_dgebp_lib4.o
+OBJS += kernel_sgemm_16x4_lib8.o kernel_sgemm_8x8_lib8.o kernel_sgemm_8x4_lib8.o kernel_sgemm_diag_lib8.o kernel_sgecp_lib8.o kernel_sgetr_lib8.o kernel_sgead_lib8.o kernel_sgesc_lib8.o kernel_sgemv_8_lib8.o kernel_sgemv_4_lib8.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
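+# Usage note (illustrative): TARGET and LA come from the included
+# Makefile.rule, so the object list above can also be selected explicitly
+# when building this directory on its own, e.g.
+#
+#     make obj TARGET=X64_INTEL_SANDY_BRIDGE LA=HIGH_PERFORMANCE
+#
+# With a normal build both variables are taken from the top-level
+# Makefile.rule instead.
+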
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
diff --git a/kernel/avx/kernel_dgebp_lib4.S b/kernel/avx/kernel_dgebp_lib4.S
new file mode 100644
index 0000000..0e8581e
--- /dev/null
+++ b/kernel/avx/kernel_dgebp_lib4.S
@@ -0,0 +1,935 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_dger4_sub_8r_lib4(int k, double *A, int sda, double *B, double *C, int sdc)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_8r_lib4
+ .type kernel_dger4_sub_8r_lib4, @function
+kernel_dger4_sub_8r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_8r_lib4
+_kernel_dger4_sub_8r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_8r_lib4
+ .def kernel_dger4_sub_8r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // C
+ movq ARG6, %r15 // sdc
+ sall $5, %r15d // 4*sdc*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ vmovapd 0(%r11, %r12, 1), %ymm4
+ vmovapd 32(%r11, %r12, 1), %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm6
+ vmovapd 96(%r11, %r12, 1), %ymm7
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ vmovapd 32(%r14), %ymm8
+ vmovapd 32(%r14, %r15, 1), %ymm9
+ vbroadcastsd 32(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 40(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 48(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 56(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 32(%r14)
+ vmovapd %ymm9, 32(%r14, %r15, 1)
+
+ vmovapd 64(%r14), %ymm8
+ vmovapd 64(%r14, %r15, 1), %ymm9
+ vbroadcastsd 64(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 72(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 80(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 88(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 64(%r14)
+ vmovapd %ymm9, 64(%r14, %r15, 1)
+
+ vmovapd 96(%r14), %ymm8
+ vmovapd 96(%r14, %r15, 1), %ymm9
+ vbroadcastsd 96(%r13), %ymm15
+ addq $128, %r13
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -24(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -8(%r13), %ymm15
+ addq $128, %r14
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, -32(%r14)
+ vmovapd %ymm9, -32(%r14, %r15, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ addq $32, %r13
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_8r_lib4, .-kernel_dger4_sub_8r_lib4
+#endif
+
+
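+// Reference sketch (illustration only, kept as a comment): a plain-C model
+// of the rank-4 update performed above, C -= A*B on an 8-row block in lib4
+// panel-major storage. The helper name and ps are assumptions; k, A, sda,
+// B, C, sdc follow the argument comments above (ARG6 carries sdc).
+//
+//	static void ref_dger4_sub_8r_lib4(int k, double *A, int sda,
+//			double *B, double *C, int sdc)
+//		{
+//		const int ps = 4; // panel height
+//		for(int jj=0; jj<k; jj++) // columns of B and C
+//			for(int ii=0; ii<8; ii++) // rows of A and C
+//				for(int ll=0; ll<4; ll++) // rank-4 inner dimension
+//					C[(ii/ps)*ps*sdc + ii%ps + jj*ps] -=
+//						A[(ii/ps)*ps*sda + ii%ps + ll*ps] * B[ll + jj*ps];
+//		}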
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dger4_sub_8r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_8r_vs_lib4
+ .type kernel_dger4_sub_8r_vs_lib4, @function
+kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_8r_vs_lib4
+_kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_8r_vs_lib4
+ .def kernel_dger4_sub_8r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // C
+ movq ARG6, %r15 // sdc
+ sall $5, %r15d // 4*sdc*sizeof(double)
+ movq ARG7, %rax // km
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ vcvtsi2sd %eax, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC01(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC01(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ vmaskmovpd 0(%r11, %r12, 1), %ymm15, %ymm4
+ vmaskmovpd 32(%r11, %r12, 1), %ymm15, %ymm5
+ vmaskmovpd 64(%r11, %r12, 1), %ymm15, %ymm6
+ vmaskmovpd 96(%r11, %r12, 1), %ymm15, %ymm7
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ vmovapd 32(%r14), %ymm8
+ vmovapd 32(%r14, %r15, 1), %ymm9
+ vbroadcastsd 32(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 40(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 48(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 56(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 32(%r14)
+ vmovapd %ymm9, 32(%r14, %r15, 1)
+
+ vmovapd 64(%r14), %ymm8
+ vmovapd 64(%r14, %r15, 1), %ymm9
+ vbroadcastsd 64(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 72(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 80(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 88(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 64(%r14)
+ vmovapd %ymm9, 64(%r14, %r15, 1)
+
+ vmovapd 96(%r14), %ymm8
+ vmovapd 96(%r14, %r15, 1), %ymm9
+ vbroadcastsd 96(%r13), %ymm15
+ addq $128, %r13
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -24(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd -8(%r13), %ymm15
+ addq $128, %r14
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, -32(%r14)
+ vmovapd %ymm9, -32(%r14, %r15, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm4, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm5, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm6, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm8, %ymm8
+ vmulpd %ymm7, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm9, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ addq $32, %r13
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_8r_vs_lib4, .-kernel_dger4_sub_8r_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_dger4_sub_4r_lib4(int n, double *A, double *B, double *C)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_4r_lib4
+ .type kernel_dger4_sub_4r_lib4, @function
+kernel_dger4_sub_4r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_4r_lib4
+_kernel_dger4_sub_4r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_4r_lib4
+ .def kernel_dger4_sub_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ movq ARG4, %r13
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ vmovapd 32(%r13), %ymm4
+ vbroadcastsd 32(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 40(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 48(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 56(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 32(%r13)
+
+ vmovapd 64(%r13), %ymm4
+ vbroadcastsd 64(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 72(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 80(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 88(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 64(%r13)
+
+ vmovapd 96(%r13), %ymm4
+ vbroadcastsd 96(%r12), %ymm15
+ addq $128, %r12
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -24(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -8(%r12), %ymm15
+ addq $128, %r13
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ addq $32, %r12
+ addq $32, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_4r_lib4, .-kernel_dger4_sub_4r_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dger4_sub_4r_vs_lib4(int n, double *A, double *B, double *C, int km)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_4r_vs_lib4
+ .type kernel_dger4_sub_4r_vs_lib4, @function
+kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_4r_vs_lib4
+_kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_4r_vs_lib4
+ .def kernel_dger4_sub_4r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ movq ARG4, %r13
+ movq ARG5, %r14
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC00(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC00(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ // load block from A
+ vmaskmovpd 0(%r11), %ymm15, %ymm0
+ vmaskmovpd 32(%r11), %ymm15, %ymm1
+ vmaskmovpd 64(%r11), %ymm15, %ymm2
+ vmaskmovpd 96(%r11), %ymm15, %ymm3
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ vmovapd 32(%r13), %ymm4
+ vbroadcastsd 32(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 40(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 48(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 56(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 32(%r13)
+
+ vmovapd 64(%r13), %ymm4
+ vbroadcastsd 64(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 72(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 80(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 88(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 64(%r13)
+
+ vmovapd 96(%r13), %ymm4
+ vbroadcastsd 96(%r12), %ymm15
+ addq $128, %r12
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -24(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd -8(%r12), %ymm15
+ addq $128, %r13
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ vmulpd %ymm1, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vmulpd %ymm2, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vmulpd %ymm3, %ymm15, %ymm14
+ vsubpd %ymm14, %ymm4, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ addq $32, %r12
+ addq $32, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_4r_vs_lib4, .-kernel_dger4_sub_4r_vs_lib4
+#endif
+
+
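+// Masking note (interpretation of the _vs_ kernels above, illustration
+// only): the per-row load mask is built from km and the constants below:
+// ymm15 = {0.5, 1.5, 2.5, 3.5} - (double)km (the 8r variant uses .LC01,
+// i.e. the same values + 4.0, for its second row panel), and vmaskmovpd
+// keeps a lane only when that difference is negative. A scalar C model of
+// the predicate (the function name is an assumption):
+//
+//	static int lane_active(int lane, int km) // lane = row index in the block
+//		{
+//		return ( (double)lane + 0.5 - (double)km ) < 0.0; // i.e. lane < km
+//		}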
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00:
+#elif defined(OS_MAC)
+LC00:
+ .align 5
+#endif
+ .double 0.5
+ .double 1.5
+ .double 2.5
+ .double 3.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01:
+#elif defined(OS_MAC)
+LC01:
+ .align 5
+#endif
+ .double 4.5
+ .double 5.5
+ .double 6.5
+ .double 7.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02:
+#elif defined(OS_MAC)
+LC02:
+ .align 5
+#endif
+ .double 8.5
+ .double 9.5
+ .double 10.5
+ .double 11.5
+
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
+
diff --git a/kernel/avx/kernel_dgemm_4x4_lib4.S b/kernel/avx/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..95ff6ea
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,9906 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r12), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $128, %r11
+
+
+ // unroll 3
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $128, %r11
+
+
+ // unroll 3
+// vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+
+// cmpl $3, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r12
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ subl $1, %r10d
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
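+// Layout note (interpretation of the routine above, illustration only):
+// each k-iteration keeps the column of A fixed and rotates the column of B
+// with vshufpd (swap within 128-bit lanes) and vperm2f128 (swap lanes), so
+// the four accumulators collect the four "rotations" of the 4x4 product
+// documented in the header comments. A scalar C model of one k-iteration
+// (the acc names are assumptions):
+//
+//	for(int i=0; i<4; i++)
+//		{
+//		acc_diag[i]  += A[i]*B[i];   // [d00 d11 d22 d33]
+//		acc_swap[i]  += A[i]*B[i^1]; // [d01 d10 d23 d32]
+//		acc_rev[i]   += A[i]*B[i^3]; // [d03 d12 d21 d30]
+//		acc_cross[i] += A[i]*B[i^2]; // [d02 d13 d20 d31]
+//		}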
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r12), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+
+ // unroll 3
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ cmpl $4, %r10d
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+
+ // unroll 3
+// vmovapd 0(%r12), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+// cmpl $3, %r10d
+
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ addq $32, %r11
+
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ addq $32, %r12
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
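+// For reference, a plain-C sketch of what this routine computes, assuming the
+// lib4 panel-major storage used throughout this file (bs=4): A is a 4xk panel,
+// B is a kx4 panel-major block with panel stride sdb, and the 4x4 accumulator
+// D is held column-wise in ymm0..ymm3 (indexing inferred from the loads below):
+//
+//     for(ll=0; ll<k; ll++)
+//         for(j=0; j<4; j++)
+//             for(i=0; i<4; i++)
+//                 D[i+4*j] += A[i+4*ll] * B[(ll/4)*4*sdb + ll%4 + 4*j];
+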
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x4_lib4, @function
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nn_4x4_lib4, @function
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nn_4x4_lib4, .-inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- B
+// r12 <- C
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- ?
+// r12 <- ?
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+
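+// For reference, a plain-C sketch of this routine (indexing inferred from the
+// loads below; lib4 layout, bs=4): the four columns of a 4x4 block of A are
+// already held in ymm0..ymm3, and B (4xk) and C (4xk) are streamed through:
+//
+//     for(j=0; j<k; j++)
+//         for(i=0; i<4; i++)
+//             for(l=0; l<4; l++)
+//                 C[i+4*j] += A[i+4*l] * B[l+4*j];
+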
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgebp_add_nn_4x4_lib4, @function
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgebp_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r12), %ymm12
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 8(%r11), %ymm13
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, 0(%r12)
+
+ vmovapd 32(%r12), %ymm12
+ vbroadcastsd 32(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 40(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 48(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 56(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, 32(%r12)
+
+ vmovapd 64(%r12), %ymm12
+ vbroadcastsd 64(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 72(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 80(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 88(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, 64(%r12)
+
+ vmovapd 96(%r12), %ymm12
+ vbroadcastsd 96(%r11), %ymm13
+ addq $128, %r11
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd -24(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd -16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd -8(%r11), %ymm13
+ addq $128, %r12
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, -32(%r12)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r12), %ymm12
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmovapd %ymm12, 0(%r12)
+
+ addq $32, %r11
+ addq $32, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // cleanup loop
+
+ // return
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgebp_add_nn_4x4_lib4, .-inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
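+// For reference, a plain-C sketch of this edge (indexing inferred from the code
+// below): when B starts offB rows into its 4-row panel, the remaining rows of
+// that first panel are consumed one at a time before the aligned main kernel:
+//
+//     kend = (4-offB < k) ? 4-offB : k; // rows left in the first B panel
+//     for(ll=0; ll<kend; ll++)
+//         for(j=0; j<4; j++)
+//             for(i=0; i<4; i++)
+//                 D[i+4*j] += A[i+4*ll] * B[offB+ll + 4*j];
+//     // A advances by kend columns; if k>kend, B jumps to the next panel
+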
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x4_lib4, @function
+inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %r15d
+ subl %r14d, %r15d // 4-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,4-offsetB)
+
+ movl %r14d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
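+// For reference, a plain-C sketch of this edge (indexing inferred from the code
+// below): with B upper triangular and accessed transposed (nt), only the
+// entries B(j,l) with j<=l of the leading 4x4 block of B contribute:
+//
+//     for(l=0; l<4; l++)
+//         for(j=0; j<=l; j++)
+//             for(i=0; i<4; i++)
+//                 D[i+4*j] += A[i+4*l] * B[j+4*l];
+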
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r10), %ymm8
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r10), %ymm8
+ vbroadcastsd 32(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 40(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r10), %ymm8
+ vbroadcastsd 64(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 72(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 80(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r10), %ymm8
+ vbroadcastsd 96(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 104(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 112(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 120(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r10
+ addq $128, %r11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r11
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r11
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ addq $32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
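+// For reference, a plain-C sketch of the offB==0 branch below (indexing
+// inferred from the code): with B lower triangular (nn), the first 4 rows
+// touch only the entries B(l,j) with j<=l:
+//
+//     for(l=0; l<4; l++)
+//         for(j=0; j<=l; j++)
+//             for(i=0; i<4; i++)
+//                 D[i+4*j] += A[i+4*l] * B[l+4*j];
+//
+// the offB>0 branches perform the same computation, but the triangle starts
+// offB rows into the current B panel and may spill into the next one.
+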
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_4x4_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jg 0f
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+0:
+ cmpl $1, %r14d
+ jg 1f
+
+ // offB==1
+
+ addq $8, %r12 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $3, %r10d // k-3
+ addq $96, %r11 // A+3*bs*sizeof(double)
+ addq %r13, %r12
+ subq $8, %r12 // B+bs*sdb*sizeof(double)-1
+
+ jmp 3f
+
+1:
+ cmpl $2, %r14d
+ jg 2f
+
+ // offB==2
+
+ addq $16, %r12 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ subl $2, %r10d // k-2
+ addq $64, %r11 // A+2*bs*sizeof(double)
+ addq %r13, %r12
+ subq $16, %r12 // B+bs*sdb*sizeof(double)-2
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r12 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-3
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_4x4_lib4, .-inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_4x4_gen_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ cmpl $0, %r14d
+ jg 0f // offB>0
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+0:
+ cmpl $1, %r14d
+ jg 1f // offB>1
+
+ // offB==1
+
+ addq $8, %r12 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+1:
+ cmpl $2, %r14d
+ jg 2f // offB>2
+
+ // offB==2
+
+ addq $16, %r12 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r12 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_4x4_gen_lib4, .-inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10 <- A
+// r11 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dlauum_nt_4x4_lib4, @function
+inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r10), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 32(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 40(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r10), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 64(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 72(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 80(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r10), %ymm8
+ vbroadcastsd 96(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 104(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 112(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 120(%r11), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r10
+ addq $128, %r11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dlauum_nt_4x4_lib4, .-inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dlauum_nt_4x4_vs_lib4, @function
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ addq $32, %r11
+ addq $32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dlauum_nt_4x4_vs_lib4, .-inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_4x4_lib4, @function
+inner_blend_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_4x4_lib4:
+#endif
+#endif
+
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
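+// For reference, this is the usual GEMM scaling step, with the C update
+// skipped entirely when beta==0.0:
+//
+//     for(j=0; j<4; j++)
+//         for(i=0; i<4; i++)
+//             D[i+4*j] = alpha*D[i+4*j] + beta*C[i+4*j];
+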
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_lib4, @function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
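+// For reference, a plain-C sketch of this variant (indexing inferred from the
+// code below; the C read is skipped when beta==0.0): the 4x4 block of C starts
+// "offset" rows into its panel, so each row may live either in the current
+// panel C or in the next one C+4*sdc:
+//
+//     for(j=0; j<4; j++)
+//         for(i=0; i<4; i++)
+//             {
+//             r = offset+i; // row within the pair of C panels
+//             c = C[(r/4)*4*sdc + r%4 + 4*j];
+//             D[i+4*j] = alpha*D[i+4*j] + beta*c;
+//             }
+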
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_gen_lib4, @function
+inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmovapd 32(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmovapd 96(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+
+ jmp 3f
+
+0:
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_gen_lib4, .-inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_4x4_lib4, @function
+inner_scale_a0_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_4x4_lib4, .-inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_4x4_lib4, @function
+inner_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_4x4_lib4, .-inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
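+	// the two rounds of vblendpd above permute the shuffled accumulator
+	// layout documented in the header back into plain column order,
+	// entirely in registers, before alpha is applied below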
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
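+	// beta != 0: accumulate beta*C column by column; when beta == 0.0 the
+	// branch above skips this block, so C is never read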
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_gen_lib4, @function
+inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmovapd 32(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmovapd 96(%r13), %ymm12
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+
+ jmp 3f
+
+0:
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
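+	// offset != 0: each 4-element column of C straddles two 4-row panels,
+	// C0 (r13) and C1 = C0 + 4*sdc*sizeof(double) (r15); the blend/permute
+	// sequences below gather the straddling elements of every column and
+	// rotate them into register row order before the beta*C accumulation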
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm3, %ymm13, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_gen_lib4, .-inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend accumulators and load C, for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_4x4_lib4, @function
+inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_4x4_lib4:
+#endif
+#endif
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_4x4_lib4, @function
+inner_edge_dpotrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
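+	// right-looking Cholesky of the 4x4 block, one column at a time:
+	// check the pivot d_kk > 0, take its sqrt, store the reciprocal to
+	// inv_diag_E, scale column k by it and subtract the rank-1 update from
+	// the remaining columns; a non-positive pivot branches to the labels at
+	// the bottom, which substitute 0.0 for the reciprocal so that the whole
+	// column is zeroed instead of propagating NaNs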
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_4x4_lib4, .-inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_4x4_vs_lib4, @function
+inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
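+	// variable-size variant: same factorization as above, but kn (r11d) is
+	// checked after each column and the routine returns early once the
+	// requested number of columns has been factorized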
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+
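+	// computes D := D * E^{-T} with E lower triangular (forward substitution
+	// over the columns of D); E is a 4x4 column-major panel, so the byte
+	// offsets 8/16/24, 48/56 and 88 select e_10/e_20/e_30, e_21/e_31 and
+	// e_32, while inv_diag_E holds the precomputed reciprocals of the diagonal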
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_4x4_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_4x4_lib4, .-inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+
+ jl 0f // ret
+
+ vbroadcastsd 8(%r10), %ymm13
+ cmpl $3, %r11d
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+
+ jl 0f // ret
+
+ vbroadcastsd 16(%r10), %ymm13
+ cmpl $4, %r11d
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+
+ jl 0f // ret
+
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_4x4_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#endif
+#endif
+
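+	// computes D := D * E^{-T} with E upper triangular, i.e. a backward
+	// substitution over the columns of D (3 down to 0); the byte offsets
+	// 96/104/112, 64/72 and 32 select e_03/e_13/e_23, e_02/e_12 and e_01 in
+	// the 4x4 column-major panel of E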
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_4x4_lib4, .-inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_run_inv_4x4_lib4, @function
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_run_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#endif
+#endif
+
+ // first column
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+ // second column
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+
+ // third column
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+
+ // fourth column
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_run_inv_4x4_lib4, .-inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lln_one_4x4_lib4, @function
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lln_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
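+	// computes D := E^{-1} * D with E unit lower triangular (forward
+	// substitution applied to all four right-hand-side columns at once):
+	// each column k of E is loaded with its rows <= k zeroed via vblendpd,
+	// the k-th entry of every D column is broadcast in-register with
+	// vperm2f128/vpermilpd, and their product is subtracted from that column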
+ vmovapd 0(%r10), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 32(%r10), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+ vmovapd 64(%r10), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lln_one_4x4_lib4, .-inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_4x4_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#endif
+#endif
+
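+	// computes D := E^{-1} * D with E upper triangular (backward substitution
+	// applied to all four right-hand-side columns at once): for k = 3..1 the
+	// k-th entry of each column is scaled by inv_diag_E[k], blended back into
+	// lane k, and its product with column k of E (diagonal and below zeroed)
+	// is subtracted from the rows above; lane 0 is finally scaled by inv_diag_E[0]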
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r11), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vbroadcastsd 0(%r11), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_4x4_lib4, .-inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r11), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_4x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_4x4_lib4, @function
+inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_4x4_lib4:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+ vmovddup %xmm14, %xmm14
+
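+	// LU factorization of the 4x4 block without pivoting, one column at a
+	// time: eliminate with the already factorized columns, compute the
+	// reciprocal of the pivot with vdivpd and store it to inv_diag_E (r10),
+	// then scale the sub-diagonal entries into the L multipliers; the
+	// vblendpd merges preserve the U entries in rows <= k, so only the
+	// sub-diagonal part of each column ends up scaled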
+ // first column
+// vblendpd $0x1, %ymm0, %ymm12, %ymm12
+ vmovapd %ymm0, %ymm12
+ vmovddup %xmm0, %xmm13
+ vdivpd %xmm13, %xmm14, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r10)
+ vmulpd %ymm0, %ymm13, %ymm0
+ vblendpd $0x1, %ymm12, %ymm0, %ymm0
+
+ // second column
+ vmovddup %xmm1, %xmm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vblendpd $0x2, %ymm1, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vdivpd %xmm13, %xmm14, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r10)
+ vmulpd %ymm1, %ymm13, %ymm1
+ vblendpd $0x3, %ymm12, %ymm1, %ymm1
+
+ // third column
+ vmovddup %xmm2, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vblendpd $0x2, %ymm2, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm2, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vblendpd $0x4, %ymm2, %ymm12, %ymm12
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vmovddup %xmm13, %xmm13
+ vdivpd %xmm13, %xmm14, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r10)
+ vmulpd %ymm2, %ymm13, %ymm2
+ vblendpd $0x7, %ymm12, %ymm2, %ymm2
+
+ // fourth column
+ vmovddup %xmm3, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vblendpd $0x2, %ymm3, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm3, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vblendpd $0x4, %ymm3, %ymm12, %ymm12
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vblendpd $0x8, %ymm3, %ymm12, %ymm12
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vdivpd %xmm13, %xmm14, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r10)
+// vmulpd %ymm3, %ymm13, %ymm3
+ vblendpd $0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_4x4_lib4, .-inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_vs_lib4, @function
+inner_store_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
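+	// build the row mask from km: assuming .LC02 (defined elsewhere in this
+	// file) holds { 0.5, 1.5, 2.5, 3.5 }, the subtraction leaves the sign bit
+	// set exactly in the lanes with row index < km, so the vmaskmovpd stores
+	// below write only the first km rows; the kn compares limit the columns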
+ cmpl $2, %r12d
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ jl 0f // end
+ cmpl $3, %r12d
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ jl 0f // end
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_lib4, @function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_lib4:
+#endif
+#endif
+
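+	// lower-triangular store: before writing columns 1..3 the strictly upper
+	// part already present in D is read back and blended in, so only the
+	// lower triangle (diagonal included) of the 4x4 block is overwritten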
+ vmovapd %ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs lower triangular
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_vs_lib4, @function
+inner_store_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_gen_lib4, @function
+inner_store_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm12, %ymm15
+ vandpd %ymm14, %ymm15, %ymm15
+
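+	// row mask = (row >= m0) AND (row < m1): both comparisons are done in
+	// double precision against .LC02 (assumed to hold { 0.5, 1.5, 2.5, 3.5 })
+	// and the two sign-bit masks are combined with vandpd for the masked stores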
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ vmaskmovpd %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ vmaskmovpd %ymm2, %ymm15, 64(%r11)
+ je 3f // end
+ vmaskmovpd %ymm3, %ymm15, 96(%r11)
+
+ jmp 3f
+
+0:
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
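+	// offset==1: data and row mask are rotated by one lane so that the block
+	// is split between panel D0 (r11) and the next panel D1 (rbx); .LC08 and
+	// .LC05 are assumed to be lane-selection constants which, ANDed with the
+	// rotated row mask, give the store masks for the D0 and D1 halves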
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%rbx)
+ jl 3f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%rbx)
+ jl 3f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%rbx)
+ je 3f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%rbx)
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm1
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm2
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%rbx)
+ jl 3f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%rbx)
+ jl 3f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%rbx)
+ je 3f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%rbx)
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm12, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm12, %ymm1, %ymm1
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm12, %ymm2, %ymm2
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm12, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%rbx)
+ jl 3f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%rbx)
+ jl 3f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%rbx)
+ je 3f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%rbx)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_gen_lib4, .-inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_gen_lib4, @function
+inner_store_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm12, %ymm15
+ vandpd %ymm14, %ymm15, %ymm15
+
+	// shift D and the accumulator registers according to the column offset n0
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm2, %ymm15, 64(%r11)
+ je 3f // end
+ vblendpd $0x4, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm3, %ymm15, 96(%r11)
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm12, %ymm0, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm12, %ymm1, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm12, %ymm2, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm12, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x2, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_gen_lib4, .-inner_store_l_4x4_gen_lib4
+#endif
+#endif
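+
+// Illustrative scalar C sketch of the effect of the store routine above for
+// the unshifted case (n0==0): write only the lower triangle (row>=col) of the
+// 4x4 accumulator, restricted to rows [m0,m1) and cols [0,n1), into D. The
+// ref_* name and the lib4 addressing used here (element of row r, col j at
+// p[(r/4)*4*sdd + 4*j + r%4], with offset the row shift inside the first
+// panel) are assumptions for illustration, not BLASFEO code.
+//
+//	// acc[i+4*j]: the 4x4 block held column-wise in ymm0..ymm3
+//	static void ref_store_l_4x4_gen(int offset, double *D, int sdd,
+//	                                int m0, int m1, int n1, double *acc)
+//		{
+//		int ii, jj, r;
+//		for(jj=0; jj<4 && jj<n1; jj++)
+//			for(ii=(m0>jj ? m0 : jj); ii<4 && ii<m1; ii++) // lower: row>=col
+//				{
+//				r = ii + offset;
+//				D[(r/4)*4*sdd + 4*jj + r%4] = acc[ii+4*jj];
+//				}
+//		}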
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .type kernel_dgemm_nt_4x4_lib4, @function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
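+
+// Illustrative scalar C reference of what the kernel above computes, assuming
+// the lib4 panel layout (element (i,l) of a 4 x k panel at p[i+4*l]). The
+// ref_* function is a sketch for documentation only, not part of BLASFEO.
+//
+//	void ref_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B,
+//	                           double *beta, double *C, double *D)
+//		{
+//		int ii, jj, ll;
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<4; ii++)
+//				{
+//				double acc = 0.0;
+//				for(ll=0; ll<k; ll++)
+//					acc += A[ii+4*ll] * B[jj+4*ll]; // nt: B^T(ll,jj) = B[jj+4*ll]
+//				D[ii+4*jj] = alpha[0]*acc + beta[0]*C[ii+4*jj]; // D = alpha*A*B^T + beta*C
+//				}
+//		}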
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .type kernel_dgemm_nt_4x4_vs_lib4, @function
+kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_vs_lib4
+_kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_dgemm_nt_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_gen_lib4
+ .type kernel_dgemm_nt_4x4_gen_lib4, @function
+kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_gen_lib4
+_kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_gen_lib4
+ .def kernel_dgemm_nt_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_gen_lib4, .-kernel_dgemm_nt_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x4_lib4
+ .type kernel_dgemm_nn_4x4_lib4, @function
+kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x4_lib4
+_kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x4_lib4
+ .def kernel_dgemm_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
+#endif
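+
+// Illustrative scalar C reference for the nn kernel above (a sketch, not part
+// of BLASFEO). A is a 4 x k panel (element (i,l) at A[i+4*l]); B is k x 4 in
+// row-panel lib4 format with panel stride sdb columns and row offset offsetB
+// inside its first panel; the B indexing below is an assumption about that
+// layout.
+//
+//	void ref_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB,
+//	                           double *B, int sdb, double *beta, double *C, double *D)
+//		{
+//		int ii, jj, ll, r;
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<4; ii++)
+//				{
+//				double acc = 0.0;
+//				for(ll=0; ll<k; ll++)
+//					{
+//					r = ll + offsetB; // row of B inside the panel structure
+//					acc += A[ii+4*ll] * B[(r/4)*4*sdb + 4*jj + r%4];
+//					}
+//				D[ii+4*jj] = alpha[0]*acc + beta[0]*C[ii+4*jj]; // D = alpha*A*B + beta*C
+//				}
+//		}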
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_dgemm_nn_4x4_gen_lib4(int k, double *alpha, double *A, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x4_gen_lib4
+ .type kernel_dgemm_nn_4x4_gen_lib4, @function
+kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x4_gen_lib4
+_kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x4_gen_lib4
+ .def kernel_dgemm_nn_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_gen_lib4, .-kernel_dgemm_nn_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .type kernel_dsyrk_nt_l_4x4_lib4, @function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
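+
+// Illustrative scalar C reference (a sketch, not part of BLASFEO): same
+// product as the dgemm nt kernel, but only the lower triangle of the 4x4
+// block is computed and stored; the strictly upper part of D is not written.
+//
+//	void ref_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B,
+//	                             double *beta, double *C, double *D)
+//		{
+//		int ii, jj, ll;
+//		for(jj=0; jj<4; jj++)
+//			for(ii=jj; ii<4; ii++) // lower triangle only
+//				{
+//				double acc = 0.0;
+//				for(ll=0; ll<k; ll++)
+//					acc += A[ii+4*ll] * B[jj+4*ll];
+//				D[ii+4*jj] = alpha[0]*acc + beta[0]*C[ii+4*jj];
+//				}
+//		}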
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_vs_lib4
+_kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_dsyrk_nt_l_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_gen_lib4
+ .type kernel_dsyrk_nt_l_4x4_gen_lib4, @function
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_gen_lib4
+_kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_gen_lib4
+ .def kernel_dsyrk_nt_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_gen_lib4, .-kernel_dsyrk_nt_l_4x4_gen_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_4x4_lib4
+ .type kernel_dtrmm_nn_rl_4x4_lib4, @function
+kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_4x4_lib4
+_kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_4x4_lib4
+ .def kernel_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_4x4_lib4, .-kernel_dtrmm_nn_rl_4x4_lib4
+#endif
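+
+// Illustrative scalar C reference (a reading of the kernel above, not BLASFEO
+// code): D = alpha * A * B with the right factor B lower triangular, so
+// column jj only receives contributions from rows ll >= jj of B. B uses the
+// same assumed lib4 row-panel indexing as the nn gemm sketch above.
+//
+//	void ref_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB,
+//	                              double *B, int sdb, double *D)
+//		{
+//		int ii, jj, ll, r;
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<4; ii++)
+//				{
+//				double acc = 0.0;
+//				for(ll=jj; ll<k; ll++) // B(ll,jj)==0 for ll<jj
+//					{
+//					r = ll + offsetB;
+//					acc += A[ii+4*ll] * B[(r/4)*4*sdb + 4*jj + r%4];
+//					}
+//				D[ii+4*jj] = alpha[0]*acc;
+//				}
+//		}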
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrmm_nn_rl_4x4_gen_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+ .type kernel_dtrmm_nn_rl_4x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_4x4_gen_lib4
+_kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+ .def kernel_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // offsetD
+ movq ARG8, %r11 // D
+ movq ARG9, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG10, %r13 // m0
+ movq ARG11, %r14 // m1
+ movq ARG12, %r15 // n0
+ movq ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_4x4_gen_lib4, .-kernel_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .type kernel_dtrmm_nt_ru_4x4_lib4, @function
+kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_lib4
+_kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10
+ movq ARG4, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
+#endif
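+
+// Illustrative scalar C reference (a reading of the kernel structure above:
+// triangular edge on the first 4 columns, plain nt loop on the rest; not
+// BLASFEO code). B is a 4 x k panel, upper triangular in the sense that
+// B(jj,ll)==0 for ll<jj.
+//
+//	void ref_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B,
+//	                              double *beta, double *C, double *D)
+//		{
+//		int ii, jj, ll;
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<4; ii++)
+//				{
+//				double acc = 0.0;
+//				for(ll=jj; ll<k; ll++) // skip the zero part of row jj of B
+//					acc += A[ii+4*ll] * B[jj+4*ll];
+//				D[ii+4*jj] = alpha[0]*acc + beta[0]*C[ii+4*jj];
+//				}
+//		}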
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
+_kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .type kernel_dpotrf_nt_l_4x4_lib4, @function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
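+
+// Illustrative scalar C reference (a sketch, not BLASFEO code): form
+// W = C - A*B^T on the 4x4 block, factor it as W = L*L^T, store the lower
+// factor L into D and the reciprocals of its diagonal into inv_diag_D.
+// Error handling for non-positive pivots is not sketched.
+//
+//	#include <math.h>
+//
+//	void ref_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C,
+//	                              double *D, double *inv_diag_D)
+//		{
+//		int ii, jj, ll;
+//		double W[16];
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<4; ii++)
+//				{
+//				W[ii+4*jj] = C[ii+4*jj];
+//				for(ll=0; ll<k; ll++)
+//					W[ii+4*jj] -= A[ii+4*ll] * B[jj+4*ll];
+//				}
+//		for(jj=0; jj<4; jj++) // left-looking Cholesky, column by column
+//			{
+//			double d = W[jj+4*jj];
+//			for(ll=0; ll<jj; ll++)
+//				d -= W[jj+4*ll] * W[jj+4*ll];
+//			d = sqrt(d);
+//			inv_diag_D[jj] = d!=0.0 ? 1.0/d : 0.0;
+//			W[jj+4*jj] = d;
+//			for(ii=jj+1; ii<4; ii++)
+//				{
+//				double s = W[ii+4*jj];
+//				for(ll=0; ll<jj; ll++)
+//					s -= W[ii+4*ll] * W[jj+4*ll];
+//				W[ii+4*jj] = s * inv_diag_D[jj];
+//				}
+//			}
+//		for(jj=0; jj<4; jj++) // store the lower triangle only
+//			for(ii=jj; ii<4; ii++)
+//				D[ii+4*jj] = W[ii+4*jj];
+//		}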
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // km
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
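+
+// The fused kernel above differs from kernel_dpotrf_nt_l_4x4_lib4 only in how
+// the 4x4 work block is formed before factorization. Illustrative sketch of
+// that accumulation (an assumption for documentation, not BLASFEO code); the
+// Cholesky step is then the same as in the reference above.
+//
+//	void ref_syrk_update_4x4(int kp, double *Ap, double *Bp, int km,
+//	                         double *Am, double *Bm, double *C, double *W)
+//		{
+//		int ii, jj, ll;
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<4; ii++)
+//				{
+//				double acc = C[ii+4*jj];
+//				for(ll=0; ll<kp; ll++)
+//					acc += Ap[ii+4*ll] * Bp[jj+4*ll]; // add part
+//				for(ll=0; ll<km; ll++)
+//					acc -= Am[ii+4*ll] * Bm[jj+4*ll]; // subtract part
+//				W[ii+4*jj] = acc; // W = C + Ap*Bp^T - Am*Bm^T
+//				}
+//		}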
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
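+
+// Illustrative scalar C reference (a sketch, not BLASFEO code): form
+// W = C - A*B^T, then solve D * E^T = W with E a 4x4 lower triangular lib4
+// block (element (i,j) at E[i+4*j]) whose reciprocal diagonal is supplied in
+// inv_diag_E.
+//
+//	void ref_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C,
+//	                                  double *D, double *E, double *inv_diag_E)
+//		{
+//		int ii, jj, ll;
+//		double W[16];
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<4; ii++)
+//				{
+//				W[ii+4*jj] = C[ii+4*jj];
+//				for(ll=0; ll<k; ll++)
+//					W[ii+4*jj] -= A[ii+4*ll] * B[jj+4*ll];
+//				}
+//		for(jj=0; jj<4; jj++) // forward sweep over the columns of D
+//			for(ii=0; ii<4; ii++)
+//				{
+//				double d = W[ii+4*jj];
+//				for(ll=0; ll<jj; ll++)
+//					d -= D[ii+4*ll] * E[jj+4*ll]; // E^T(ll,jj) = E(jj,ll)
+//				D[ii+4*jj] = d * inv_diag_E[jj];
+//				}
+//		}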
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_4x4_lib4
+ .type kernel_dtrsm_nt_rl_one_4x4_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_4x4_lib4
+_kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_4x4_lib4
+ .def kernel_dtrsm_nt_rl_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_4x4_lib4, .-kernel_dtrsm_nt_rl_one_4x4_lib4
+#endif
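+
+// This kernel performs the same solve as sketched after
+// kernel_dtrsm_nt_rl_inv_4x4_lib4, except that E has a unit diagonal, so the
+// final scaling by a reciprocal diagonal disappears (illustrative reading,
+// not BLASFEO code):
+//
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<4; ii++)
+//				{
+//				double d = W[ii+4*jj];
+//				for(ll=0; ll<jj; ll++)
+//					d -= D[ii+4*ll] * E[jj+4*ll];
+//				D[ii+4*jj] = d; // unit diagonal: no division
+//				}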
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_one_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // km
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+ .type kernel_dtrsm_nt_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_4x4_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+ .def kernel_dtrsm_nt_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_4x4_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_lib4
+#endif
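+
+// Illustrative reading of the solve step of the kernel above (not BLASFEO
+// code): after W = C - A*B^T as in the rl_inv sketch, solve D * E^T = W with
+// E upper triangular, which resolves the columns of D in reverse order.
+//
+//		for(jj=3; jj>=0; jj--)
+//			for(ii=0; ii<4; ii++)
+//				{
+//				double d = W[ii+4*jj];
+//				for(ll=jj+1; ll<4; ll++)
+//					d -= D[ii+4*ll] * E[jj+4*ll]; // E(jj,ll) with ll>jj
+//				D[ii+4*jj] = d * inv_diag_E[jj];
+//				}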
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nt_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nn_ru_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+ .type kernel_dtrsm_nn_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_4x4_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+ .def kernel_dtrsm_nn_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_4x4_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nn_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_dtrsm_nn_ll_one_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_4x4_lib4
+ .type kernel_dtrsm_nn_ll_one_4x4_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_4x4_lib4
+_kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_4x4_lib4
+ .def kernel_dtrsm_nn_ll_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_4x4_lib4, .-kernel_dtrsm_nn_ll_one_4x4_lib4
+#endif
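+
+// The solve stage (inner_edge_dtrsm_lln_one_4x4_lib4) corresponds to forward
+// substitution with a unit lower triangular E, which is why this variant takes no
+// inv_diag_E argument. A plain-C sketch (ref_dtrsm_lln_one_4x4 is an illustrative
+// name; E is assumed panel-major with element (i,l) at E[4*l+i]):
+//
+//   static void ref_dtrsm_lln_one_4x4(const double *E, double acc[4][4])
+//   {
+//       for (int j = 0; j < 4; j++)
+//           for (int i = 1; i < 4; i++)
+//               for (int l = 0; l < i; l++)
+//                   acc[i][j] -= E[4*l+i] * acc[l][j];   // unit diagonal: no division
+//   }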
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+ .type kernel_dtrsm_nn_ll_one_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+ .def kernel_dtrsm_nn_ll_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_4x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nn_lu_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+ .type kernel_dtrsm_nn_lu_inv_4x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_4x4_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+ .def kernel_dtrsm_nn_lu_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_4x4_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_lib4
+#endif
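+
+// The solve stage (inner_edge_dtrsm_lun_inv_4x4_lib4) corresponds to backward
+// substitution with an upper triangular E, multiplying by the precomputed
+// reciprocals in inv_diag_E instead of dividing. A plain-C sketch (illustrative
+// name; panel-major E with element (i,l) at E[4*l+i] assumed):
+//
+//   static void ref_dtrsm_lun_inv_4x4(const double *E, const double *inv_diag_E,
+//                                     double acc[4][4])
+//   {
+//       for (int j = 0; j < 4; j++)
+//           for (int i = 3; i >= 0; i--) {
+//               for (int l = i+1; l < 4; l++)
+//                   acc[i][j] -= E[4*l+i] * acc[l][j];
+//               acc[i][j] *= inv_diag_E[i];
+//           }
+//   }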
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nn_lu_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+ movq ARG9, %r12 // km
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4 // TODO
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dgetrf_nn_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_4x4_lib4
+ .type kernel_dgetrf_nn_4x4_lib4, @function
+kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_4x4_lib4
+_kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_4x4_lib4
+ .def kernel_dgetrf_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG7, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_4x4_lib4, .-kernel_dgetrf_nn_4x4_lib4
+#endif
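+
+// The factorization stage (inner_edge_dgetrf_4x4_lib4) corresponds to the standard
+// unblocked, unpivoted LU of the 4x4 accumulator, with unit-diagonal L and U stored
+// in place and the reciprocals of U's diagonal written to inv_diag_D. A plain-C
+// sketch (ref_dgetrf_4x4 is an illustrative name only):
+//
+//   static void ref_dgetrf_4x4(double acc[4][4], double inv_diag_D[4])
+//   {
+//       for (int j = 0; j < 4; j++) {
+//           double tmp = 1.0 / acc[j][j];
+//           inv_diag_D[j] = tmp;
+//           for (int i = j+1; i < 4; i++)
+//               acc[i][j] *= tmp;                         // column of L
+//           for (int jj = j+1; jj < 4; jj++)
+//               for (int i = j+1; i < 4; i++)
+//                   acc[i][jj] -= acc[i][j] * acc[j][jj]; // trailing update
+//       }
+//   }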
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgetrf_nn_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_4x4_vs_lib4
+ .type kernel_dgetrf_nn_4x4_vs_lib4, @function
+kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_4x4_vs_lib4
+_kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_4x4_vs_lib4
+ .def kernel_dgetrf_nn_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG7, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_4x4_vs_lib4, .-kernel_dgetrf_nn_4x4_vs_lib4
+#endif
+
+
+
+
+
+#if 0
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dlauum_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlauum_nt_4x4_lib4
+ .type kernel_dlauum_nt_4x4_lib4, @function
+kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlauum_nt_4x4_lib4
+_kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlauum_nt_4x4_lib4
+ .def kernel_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dlauum_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlauum_nt_4x4_lib4, .-kernel_dlauum_nt_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dlauum_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlauum_nt_4x4_vs_lib4
+ .type kernel_dlauum_nt_4x4_vs_lib4, @function
+kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlauum_nt_4x4_vs_lib4
+_kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlauum_nt_4x4_vs_lib4
+ .def kernel_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dlauum_nt_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlauum_nt_4x4_vs_lib4, .-kernel_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb4_r_4_lib4
+ .type kernel_dlarfb4_r_4_lib4, @function
+kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb4_r_4_lib4
+_kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb4_r_4_lib4
+ .def kernel_dlarfb4_r_4_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG2, %r12 // V
+
+	subl	$4, %r10d // k-4
+	addq	$128, %r11 // D+4*bs
+	addq	$128, %r12 // V+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG2, %r12 // V
+
+ //
+ vmovapd 0(%r11), %ymm12
+ vaddpd %ymm12, %ymm0, %ymm0
+ //
+ vmovapd 32(%r11), %ymm12
+ vaddpd %ymm12, %ymm1, %ymm1
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ //
+ vmovapd 64(%r11), %ymm12
+ vaddpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 64(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 72(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ //
+ vmovapd 96(%r11), %ymm12
+ vaddpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 96(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 104(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 112(%r12), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+
+ movq ARG3, %r10 // T
+
+ //
+ vbroadcastsd 120(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ //
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 80(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ //
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 40(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ //
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vbroadcastsd 0(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // V
+ movq ARG4, %r12 // D
+
+ //
+ vmovapd 0(%r12), %ymm12
+ vaddpd %ymm12, %ymm0, %ymm12
+ vmovapd %ymm12, 0(%r12)
+ //
+ vmovapd 32(%r12), %ymm12
+ vbroadcastsd 32(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vaddpd %ymm12, %ymm1, %ymm12
+ vmovapd %ymm12, 32(%r12)
+ //
+ vmovapd 64(%r12), %ymm12
+ vbroadcastsd 64(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 72(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vaddpd %ymm12, %ymm2, %ymm12
+ vmovapd %ymm12, 64(%r12)
+ //
+ vmovapd 96(%r12), %ymm12
+ vbroadcastsd 96(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 104(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vbroadcastsd 112(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vaddpd %ymm12, %ymm3, %ymm12
+ vmovapd %ymm12, 96(%r12)
+
+	subl	$4, %r10d // k-4
+	addq	$128, %r11 // V+4*bs
+	addq	$128, %r12 // D+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgebp_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb4_r_4_lib4, .-kernel_dlarfb4_r_4_lib4
+#endif
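+
+// Outline of the computation above (a sketch, reading pV, pT, pD as in the
+// prototype; the kernel applies a compact-WY block of 4 reflectors from the
+// right, with any sign convention folded into T):
+//
+//   W  = D * V^T     // the unit-diagonal 4x4 head of pV is handled explicitly via
+//                    // the broadcasts at 32(%r12)..112(%r12), the remaining kmax-4
+//                    // columns via inner_kernel_dgemm_add_nt_4x4_lib4
+//   W  = W * T       // pT is a 4x4 upper triangular factor, read column-major
+//   D += W * V       // first 4 columns of D updated explicitly, the rest via
+//                    // inner_kernel_dgebp_add_nn_4x4_lib4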
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { -1 -1 -1 1 }
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { -1 -1 -1 -1 }
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 3.5 2.5 1.5 0.5 }
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { 7.5 6.5 5.5 4.5 }
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC04: // { 1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
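+
+// Note on the encoding: each constant is a vector of four doubles written as
+// little-endian 32-bit halves, low word first. For example ".long 0" followed by
+// ".long 1072693248" is 0x3FF0000000000000 = 1.0, ".long 0" / ".long -1074790400"
+// is 0xBFF0000000000000 = -1.0, and 1071644672 (0x3FE00000) / 1074003968
+// (0x40040000) give 0.5 and 2.5 respectively.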
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemm_8x4_lib4.S b/kernel/avx/kernel_dgemm_8x4_lib4.S
new file mode 100644
index 0000000..e9f1f34
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_8x4_lib4.S
@@ -0,0 +1,13154 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
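+
+// Usage sketch (illustrative only; this is how the kernels below consume the
+// macros): register arguments stay in their ABI registers, and stack arguments are
+// addressed relative to the rsp adjusted by PROLOGUE. On Linux/Mac the return
+// address ends up at STACKSIZE(%rsp), so the first stack argument is
+// ARG7 = STACKSIZE+8(%rsp); on Windows the caller's 32-byte shadow area is added,
+// so the first stack argument is ARG5 = STACKSIZE+40(%rsp).
+//
+//   PROLOGUE               // reserve STACKSIZE, save callee-saved regs, vzeroupper
+//   movq   ARG1, %r10      // e.g. k   (rdi on Linux/Mac, rcx on Windows)
+//   movq   ARG7, %r11      // e.g. the first stack-passed argument
+//   ...                    // kernel body
+//   EPILOGUE               // restore registers, vzeroupper, release the stack
+//   ret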
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_8x4_lib4, @function
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+// movq %r11, %r15 // A1 <- A0
+// addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ // prefetch
+ vmovapd 0(%r11), %ymm8 // A0[0]
+// vmovapd 0(%r15), %ymm9 // A1[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmovapd 0(%r13), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 32(%r15), %ymm11 // A1[4]
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 64(%r15), %ymm9 // A1[8]
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 96(%r15), %ymm11 // A1[12]
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ addq $128, %r11
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $128, %r15
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+
+ // unroll 3
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 0(%r15), %ymm9 // A1[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 32(%r15), %ymm11 // A1[4]
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 64(%r15), %ymm9 // A1[8]
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 96(%r15), %ymm11 // A1[12]
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+// addq $128, %r15
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+
+ // unroll 3
+// vmovapd 0(%r13), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+// vmovapd 0(%r15), %ymm9 // A1[0]
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+
+// cmpl $3, %r10d
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+// vmovapd 0(%r15), %ymm9 // A1[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ addq $32, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r13
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+// addq $32, %r15
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ subl $1, %r10d
+ vmulpd %ymm9, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_8x4_lib4, .-inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
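+
+// A rough C reference for the 8x4 "nt" accumulation above (a sketch; the ref_*
+// name is illustrative, the lib4 panel-major layout with bs=4 is assumed, and A1
+// denotes the panel at A + 4*sda doubles, i.e. 0(%r11, %r12, 1)):
+//
+//   static void ref_dgemm_add_nt_8x4(int k, const double *A0, const double *A1,
+//                                    const double *B, double D[8][4])
+//   {
+//       for (int l = 0; l < k; l++)
+//           for (int j = 0; j < 4; j++)
+//               for (int i = 0; i < 4; i++) {
+//                   D[i][j]   += A0[4*l+i] * B[4*l+j];   // rows 0..3
+//                   D[4+i][j] += A1[4*l+i] * B[4*l+j];   // rows 4..7
+//               }
+//   }
+//
+// The ymm accumulators hold these sums in the permuted order documented in the
+// register comments (e.g. ymm0 = [d00 d11 d22 d33]); a later inner_blend_* stage
+// restores the plain column layout before scaling and storing.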
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_8x4_lib4, @function
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmovapd 0(%r13), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+
+ // unroll 3
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ cmpl $4, %r10d
+
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+
+ // unroll 3
+// vmovapd 0(%r13), %ymm12 // B[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+// cmpl $3, %r10d
+
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm10, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vmulpd %ymm11, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmulpd %ymm10, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ addq $32, %r11
+
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ addq $32, %r13
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vmulpd %ymm8, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ subl $1, %r10d
+ vmulpd %ymm9, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_8x4_lib4, .-inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- k
+// r11 <- A+4*sda*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
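+// Addressing note for the "nn" variant (a reading of the code below, offsets in
+// bytes): B is panel-major with panel stride r14 = 4*sdb*sizeof(double), so within
+// the current panel element B(l,j) sits at byte offset 32*j + 8*(l%4); each unroll
+// step broadcasts the four j values for one l, and "addq %r14, %r13" moves to the
+// next panel after four steps. A rough C reference (the ref_* name and the layout
+// above are assumptions):
+//
+//   static void ref_dgemm_add_nn_8x4(int k, const double *A0, const double *A1,
+//                                    const double *B, int sdb, double D[8][4])
+//   {
+//       for (int l = 0; l < k; l++) {
+//           const double *Bp = B + (l/4)*4*sdb + l%4;    // row l of the current B panel
+//           for (int j = 0; j < 4; j++)
+//               for (int i = 0; i < 4; i++) {
+//                   D[i][j]   += A0[4*l+i] * Bp[4*j];
+//                   D[4+i][j] += A1[4*l+i] * Bp[4*j];
+//               }
+//       }
+//   }
+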
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_8x4_lib4, @function
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 2) // software prefetch
+ prefetcht0 64(%r13, %r14, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq %r14, %r13
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+// vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ addq %r14, %r13
+ vmulpd %ymm11, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ addq $32, %r11
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ subl $1, %r10d
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ addq $8, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_8x4_lib4, .-inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- k
+// r11 <- A+4*sda*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
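+// note (not part of the upstream source): same access pattern as the add_nn routine
+// sketched above, with the accumulation negated, i.e. D0[ii+jj*4] -= A0[ii+kk*4]*b and
+// D1[ii+jj*4] -= A1[ii+kk*4]*b, using the same illustrative names.
+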
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nn_8x4_lib4, @function
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 2) // software prefetch
+ prefetcht0 64(%r13, %r14, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq %r14, %r13
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+// vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ addq %r14, %r13
+ vmulpd %ymm11, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ addq $32, %r11
+
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ subl $1, %r10d
+
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ addq $8, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nn_8x4_lib4, .-inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
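+// note (not part of the upstream source): a minimal C-style sketch of this 4x8 variant,
+// assuming the same lib4 layout (bs=4) and illustrative names as above; a single 4-row
+// panel of A is multiplied by 8 columns of B, with the 8 accumulators kept in ymm0-7:
+//
+// for(kk=0; kk<k; kk++)
+//     for(jj=0; jj<8; jj++)
+//         for(ii=0; ii<4; ii++)
+//             D[ii+jj*4] += A[ii+kk*4] * B[kk%4 + jj*4 + (kk/4)*4*sdb];
+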
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x8_lib4, @function
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+ prefetcht0 128(%r12, %r13, 2) // software prefetch
+ prefetcht0 192(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 136(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 168(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 200(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 232(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq $128, %r11
+
+ // unroll 2
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 144(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 176(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 208(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 240(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ // unroll 3
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 152(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 184(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 216(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 248(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 136(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 168(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 200(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 232(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq $128, %r11
+
+ // unroll 2
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 144(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 176(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 208(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 240(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ // unroll 3
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 152(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 184(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 216(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 248(%r12), %ymm12 // B
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x8_lib4, .-inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- B
+// r12 <- C
+// r13 <- 32*sdc
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+// ymm4 <- [a40 a50 a60 a70]
+// ymm5 <- [a41 a51 a61 a71]
+// ymm6 <- [a42 a52 a62 a72]
+// ymm7 <- [a43 a53 a63 a73]
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- ?
+// r12 <- ?
+// r13 <- 32*sdc
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+// ymm4 <- [a40 a50 a60 a70]
+// ymm5 <- [a41 a51 a61 a71]
+// ymm6 <- [a42 a52 a62 a72]
+// ymm7 <- [a43 a53 a63 a73]
+
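+// note (not part of the upstream source): a minimal C-style sketch, assuming the lib4
+// layout (bs=4) and illustrative names; the 8x4 block held in ymm0-7 acts as A0 (rows
+// 0-3) and A1 (rows 4-7), C1 = C0 + 4*sdc, and r10d counts the columns of B and C:
+//
+// for(jj=0; jj<k; jj++)
+//     for(ii=0; ii<4; ii++)
+//         for(ll=0; ll<4; ll++)
+//             {
+//             C0[ii+jj*4] += A0[ii+ll*4] * B[ll+jj*4];
+//             C1[ii+jj*4] += A1[ii+ll*4] * B[ll+jj*4];
+//             }
+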
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgebp_add_nn_8x4_lib4, @function
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgebp_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 8(%r11), %ymm13
+ subl $4, %r10d
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+
+ vmovapd 32(%r12), %ymm12
+ vmovapd 32(%r12, %r13, 1), %ymm14
+ vbroadcastsd 32(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 40(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 48(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 56(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, 32(%r12)
+ vmovapd %ymm14, 32(%r12, %r13, 1)
+
+ vmovapd 64(%r12), %ymm12
+ vmovapd 64(%r12, %r13, 1), %ymm14
+ vbroadcastsd 64(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 72(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 80(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 88(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, 64(%r12)
+ vmovapd %ymm14, 64(%r12, %r13, 1)
+
+ vmovapd 96(%r12), %ymm12
+ vmovapd 96(%r12, %r13, 1), %ymm14
+ vbroadcastsd 96(%r11), %ymm13
+ addq $128, %r11
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd -24(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd -16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd -8(%r11), %ymm13
+ addq $128, %r12
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, -32(%r12)
+ vmovapd %ymm14, -32(%r12, %r13, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm7, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+
+ addq $32, %r11
+ addq $32, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // cleanup loop
+
+ // return
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgebp_add_nn_8x4_lib4, .-inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
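+// note (not part of the upstream source): this edge peels off the first min(k, 4-offB)
+// iterations one at a time, starting at row offB inside the first B panel, so that the
+// aligned nn kernel above can take over at a panel boundary; each iteration performs the
+// same rank-1 update as in the C sketch given for that kernel.
+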
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_8x4_lib4, @function
+inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r15d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %ebx
+ subl %r15d, %ebx // 4-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,4-offsetB)
+
+ movl %r15d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r13 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r14, %r13
+ subq $32, %r13 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_8x4_lib4, .-inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x8_lib4, @function
+inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %r15d
+ subl %r14d, %r15d // 4-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,4-offsetB)
+
+ movl %r14d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vbroadcastsd 128(%r12), %ymm13 // B
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 160(%r12), %ymm13 // B
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 192(%r12), %ymm13 // B
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 224(%r12), %ymm13 // B
+ vmulpd %ymm12, %ymm13, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x8_lib4, .-inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- 4*sda*sizeof(double)
+// r12 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- 4*sda*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
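+// note (not part of the upstream source): a minimal C-style sketch of this triangular
+// edge, assuming the lib4 layout and illustrative names as above; since B is upper
+// triangular, A column kk only contributes to D columns jj<=kk over these 4 iterations:
+//
+// for(kk=0; kk<4; kk++)
+//     for(jj=0; jj<=kk; jj++)
+//         for(ii=0; ii<4; ii++)
+//             {
+//             D0[ii+jj*4] += A0[ii+kk*4] * B[jj+kk*4];
+//             D1[ii+jj*4] += A1[ii+kk*4] * B[jj+kk*4];
+//             }
+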
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_8x4_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm12
+ vmovapd 0(%r10), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r10, %r11, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 32(%r10), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10, %r11, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 40(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ vbroadcastsd 64(%r12), %ymm12
+ vmovapd 64(%r10), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r10, %r11, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 72(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 80(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vbroadcastsd 96(%r12), %ymm12
+ vmovapd 96(%r10), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 96(%r10, %r11, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 104(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 112(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 120(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ addq $128, %r10
+ addq $128, %r12
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_8x4_lib4, .-inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*4*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_8x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ addq $32, %r11
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ addq $32, %r11
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ addq $32, %r11
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ addq $32, %r11
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ addq $32, %r13
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_8x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A0
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
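+// note (not part of the upstream source): a minimal C-style sketch of the offB==0 branch,
+// assuming the lib4 layout and illustrative names as above; with B lower triangular, row
+// kk of the leading B panel only contributes to D columns jj<=kk, and the offB=1,2,3
+// branches perform the same update starting from row offB of the panel:
+//
+// for(kk=0; kk<4; kk++)
+//     for(jj=0; jj<=kk; jj++)
+//         for(ii=0; ii<4; ii++)
+//             {
+//             D0[ii+jj*4] += A0[ii+kk*4] * B[kk+jj*4];
+//             D1[ii+jj*4] += A1[ii+kk*4] * B[kk+jj*4];
+//             }
+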
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_8x4_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r15d
+ jg 0f
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+0:
+ cmpl $1, %r15d
+ jg 1f
+
+ // offB==1
+
+ addq $8, %r13 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $3, %r10d // k-3
+ addq $96, %r11 // A0+3*bs*sizeof(double)
+ addq %r14, %r13
+ subq $8, %r13 // B+bs*sdb*sizeof(double)-1
+
+ jmp 3f
+
+1:
+ cmpl $2, %r15d
+ jg 2f
+
+ // offB==2
+
+ addq $16, %r13 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ subl $2, %r10d // k-2
+ addq $64, %r11 // A0+2*bs*sizeof(double)
+ addq %r14, %r13
+ subq $16, %r13 // B+bs*sdb*sizeof(double)-2
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 104(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r13 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-3
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 72(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 112(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_8x4_lib4, .-inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A0
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_8x4_gen_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_gen_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ cmpl $0, %r15d
+ jg 0f // offB>0
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+0:
+ cmpl $1, %r15d
+ jg 1f // offB>1
+
+ // offB==1
+
+ addq $8, %r13 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+1:
+ cmpl $2, %r15d
+ jg 2f // offB>2
+
+ // offB==2
+
+ addq $16, %r13 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+	subl	$1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r13 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm9, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+	subl	$1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_8x4_gen_lib4, .-inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
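+//
+// rough sketch: the blends are a pure lane permutation (no arithmetic) that
+// turns the diagonally interleaved accumulator back into plain columns,
+// e.g. ymm0 ends up holding column 0 of the upper 4x4 block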
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_8x4_lib4, @function
+inner_blend_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_8x4_lib4:
+#endif
+#endif
+
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_8x4_lib4, .-inner_blend_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
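+//
+// rough sketch (alpha = beta = 1.0): plain accumulation into C, column by
+// column: ymm_j += C[0:4][j] and ymm_{4+j} += C[4:8][j], j = 0..3, with the
+// lower four rows of C one panel further, at C + 4*sdc doubles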
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x4_lib4, @function
+inner_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_8x4_lib4:
+#endif
+#endif
+
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r10, %r11, 1), %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r10, %r11, 1), %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r10, %r11, 1), %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r10, %r11, 1), %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x4_lib4, .-inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
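+//
+// rough sketch: beta is fixed at 0.0 here, so the accumulator is only
+// scaled in place: ymm_j = alpha * ymm_j, j = 0..7 (alpha is read via r10)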
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_8x4_lib4, @function
+inner_scale_a0_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_8x4_lib4, .-inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
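+//
+// rough sketch of the update, with the two 4-row halves of C one panel
+// apart (r13 = 4*sdc doubles), for j = 0..3:
+//   ymm_j = alpha*ymm_j + beta*C[0:4][j],  ymm_{4+j} = alpha*ymm_{4+j} + beta*C[4:8][j]
+// the beta part is skipped entirely when *beta == 0.0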
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_lib4, @function
+inner_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ // alg==1
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
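+//
+// rough sketch: same alpha/beta update as the aligned case, but C may start
+// at a row offset inside its panel (r12); for offset > 0 each 4-row half of
+// a C column is gathered from two adjacent panels (three panels are touched
+// in total) and rotated into register order before the beta multiply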
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_gen_lib4, @function
+inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovapd 32(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+ vmovapd 64(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm2, %ymm14, %ymm2
+ vmovapd 96(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm3, %ymm14, %ymm3
+
+ vmovapd 0(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm4, %ymm14, %ymm4
+ vmovapd 32(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm5, %ymm14, %ymm5
+ vmovapd 64(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm6, %ymm14, %ymm6
+ vmovapd 96(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm7, %ymm14, %ymm7
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_gen_lib4, .-inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
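+//
+// rough sketch: first the same lane permutation as INNER_BLEND_8X4_LIB4 to
+// bring the accumulator back to column order, then the usual update
+// ymm_j = alpha*ymm_j + beta*C[:,j]; the beta part is skipped when *beta == 0.0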
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_lib4, @function
+inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ // alg==1
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r12, %r13, 1), %ymm15
+ vmulpd %ymm15, %ymm14, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_lib4, .-inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
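+//
+// rough sketch, assuming ymm0..ymm7 hold the eight 4-double columns of the
+// 4x8 block and C stores those columns contiguously:
+//   ymm_j = alpha*ymm_j + beta*C[0:4][j],  j = 0..7
+// (the beta part is skipped when *beta == 0.0; note that ymm4..ymm7 are
+// updated as well, even though they are not listed above)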
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x8_lib4, @function
+inner_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x8_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmovapd 128(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x8_lib4, .-inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
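+//
+// rough sketch: the unpck/perm2f128 sequences transpose each 4x4 half of
+// the 8x4 accumulator held in ymm0..ymm7, giving the 4x8 result; then
+//   ymm_j = alpha*ymm_j + beta*C[0:4][j],  j = 0..7
+// with the beta part skipped when *beta == 0.0 (ymm8..ymm11 are not
+// touched by this routine)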
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_lib4, @function
+inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib4:
+#endif
+#endif
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm0, %ymm1, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm2, %ymm3, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm12, %ymm14, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm13, %ymm15, %ymm3
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm4, %ymm5, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm6, %ymm7, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm12, %ymm14, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm13, %ymm15, %ymm7
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmovapd 128(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vmulpd %ymm14, %ymm15, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_lib4, .-inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
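+//
+// rough sketch: lane permutation of the accumulator back to column order,
+// followed by the alpha/beta update against a C that may start at a row
+// offset inside its panel; as in the plain gen variant, each 4-row half of
+// a C column is gathered from adjacent panels and rotated into place when
+// offset > 0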
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_gen_lib4, @function
+inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovapd 32(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+ vmovapd 64(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm2, %ymm14, %ymm2
+ vmovapd 96(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm3, %ymm14, %ymm3
+
+ vmovapd 0(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm4, %ymm14, %ymm4
+ vmovapd 32(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm5, %ymm14, %ymm5
+ vmovapd 64(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm6, %ymm14, %ymm6
+ vmovapd 96(%r13, %r14, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm14
+ vaddpd %ymm7, %ymm14, %ymm7
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r13, %r14, 1), %ymm13
+ vmovapd 0(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm4, %ymm13, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%r13, %r14, 1), %ymm13
+ vmovapd 32(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm5, %ymm13, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r13, %r14, 1), %ymm13
+ vmovapd 64(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm6, %ymm13, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%r13, %r14, 1), %ymm13
+ vmovapd 96(%r13, %r14, 2), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vmulpd %ymm12, %ymm15, %ymm12
+ vaddpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm13, %ymm15, %ymm13
+ vaddpd %ymm7, %ymm13, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_gen_lib4, .-inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
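+//
+// rough sketch (alpha = beta = 1.0): permute the accumulator back to column
+// order, then simply add C column by column:
+//   ymm_j += C[0:4][j],  ymm_{4+j} += C[4:8][j],  j = 0..3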
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_lib4, @function
+inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib4:
+#endif
+#endif
+
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ // alg==1
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r10, %r11, 1), %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r10, %r11, 1), %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r10, %r11, 1), %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r10, %r11, 1), %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_lib4, .-inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
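+//
+// rough sketch of the factorization, column by column for j = 0..3:
+//   inv_diag_E[j] = (d_jj > 0.0) ? 1.0/sqrt(d_jj) : 0.0
+//   scale column j of both 4x4 blocks by inv_diag_E[j], then, for every
+//   later column i > j, subtract l_ij * (column j), where l_ij is the i-th
+//   element of the already-scaled column j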
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_8x4_lib4, @function
+inner_edge_dpotrf_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_8x4_lib4, .-inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization vs
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
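+//
+// rough sketch: same column-by-column factorization as the non-vs variant,
+// but with early returns on kn (r11d), so only the first kn columns are
+// factorized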
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_8x4_vs_lib4, @function
+inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $2, %r11d
+ jl 0f // ret
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $3, %r11d
+ jl 0f // ret
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $4, %r11d
+ jl 0f // ret
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilpd $0x3, %xmm13, %xmm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_8x4_vs_lib4, .-inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
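+//
+// rough sketch: overwrites the accumulator (ymm0..ymm7, holding B) with the
+// solution X of X * E^T = B, E lower triangular, using the precomputed
+// reciprocals in inv_diag_E; column by column for j = 0..3:
+//   X[:,j] *= inv_diag_E[j]
+//   X[:,i] -= E[i][j] * X[:,j]   for every remaining column i > j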
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x4_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $2, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $3, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+ cmpl $4, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
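+//
+// reference only (not part of the kernel): with a unit diagonal there is
+// no scaling, only the trailing-column updates; same notation as above:
+//
+//   for(jj=0; jj<4; jj++)
+//       for(kk=jj+1; kk<4; kk++)
+//           for(ii=0; ii<8; ii++) D[ii][kk] -= E[kk][jj] * D[ii][jj];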
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_8x4_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 8(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+
+ vbroadcastsd 16(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_8x4_lib4, .-inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+
+ jl 0f // ret
+
+ vbroadcastsd 8(%r10), %ymm13
+ cmpl $3, %r11d
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm5, %ymm5
+
+ jl 0f // ret
+
+ vbroadcastsd 16(%r10), %ymm13
+ cmpl $4, %r11d
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 48(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm6, %ymm6
+
+ jl 0f // ret
+
+ vbroadcastsd 24(%r10), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+ vbroadcastsd 56(%r10), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+ vbroadcastsd 88(%r10), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm12
+ vsubpd %ymm12, %ymm7, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
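+//
+// reference only (not part of the kernel): a plain-C model of the backward
+// column sweep, with E the 4x4 upper triangular factor and inv_diag_E its
+// inverted diagonal; indices refer to the logical matrices:
+//
+//   for(jj=3; jj>=0; jj--)
+//       {
+//       for(ii=0; ii<8; ii++) D[ii][jj] *= inv_diag_E[jj];
+//       for(kk=0; kk<jj; kk++)
+//           for(ii=0; ii<8; ii++) D[ii][kk] -= E[kk][jj] * D[ii][jj];
+//       }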
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_8x4_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm5, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_8x4_lib4, .-inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm7, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm5, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
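+//
+// reference only (not part of the kernel): a plain-C model of the forward
+// column sweep, same notation as in the previous edge routines:
+//
+//   for(jj=0; jj<4; jj++)
+//       {
+//       for(kk=0; kk<jj; kk++)
+//           for(ii=0; ii<8; ii++) D[ii][jj] -= E[kk][jj] * D[ii][kk];
+//       for(ii=0; ii<8; ii++) D[ii][jj] *= inv_diag_E[jj];
+//       }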
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_run_inv_8x4_lib4, @function
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_run_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#endif
+#endif
+
+ // first column
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+ // second column
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+
+ // third column
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+
+ // fourth column
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_run_inv_8x4_lib4, .-inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10 <- E0
+// r11 <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E0
+// r11 <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
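+//
+// reference only (not part of the kernel): forward substitution with the
+// 8x8 unit lower triangular E stored as two 4-row panels (second panel at
+// E0 + 4*sde); a plain-C model on the logical matrices:
+//
+//   for(kk=0; kk<8; kk++)
+//       for(jj=0; jj<4; jj++)
+//           for(ii=kk+1; ii<8; ii++)
+//               D[ii][jj] -= E[ii][kk] * D[kk][jj];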
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lln_one_8x4_lib4, @function
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lln_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#endif
+#endif
+
+ // solve top-left
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10), %ymm12
+ vxorpd %ymm14, %ymm14, %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 0(%r10, %r11, 1), %ymm14
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 32(%r10), %ymm12
+ vxorpd %ymm14, %ymm14, %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r10, %r11, 1), %ymm14
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 64(%r10), %ymm12
+ vxorpd %ymm14, %ymm14, %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 64(%r10, %r11, 1), %ymm14
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 96(%r10, %r11, 1), %ymm14
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm14, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ addq $128, %r10
+
+
+	// solve bottom-right
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10, %r11, 1), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 32(%r10, %r11, 1), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+ vmovapd 64(%r10, %r11, 1), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vmulpd %ymm12, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lln_one_8x4_lib4, .-inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
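+//
+// reference only (not part of the kernel): backward substitution with the
+// 8x8 upper triangular E (two 4-row panels) and its inverted diagonal; a
+// plain-C model on the logical matrices:
+//
+//   for(kk=7; kk>=0; kk--)
+//       for(jj=0; jj<4; jj++)
+//           {
+//           D[kk][jj] *= inv_diag_E[kk];
+//           for(ii=0; ii<kk; ii++)
+//               D[ii][jj] -= E[ii][kk] * D[kk][jj];
+//           }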
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_8x4_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#endif
+#endif
+
+ // bottom-right
+
+ vmovapd 224(%r10, %r11, 1), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 56(%r12), %ymm12
+ vmovapd 224(%r10), %ymm11
+
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+	vxorpd %ymm13, %ymm13, %ymm13 // 0.0 (the VEX xmm load below zeroes the upper lanes anyway)
+ vmovapd 192(%r10, %r11, 1), %xmm13
+ vbroadcastsd 48(%r12), %ymm12
+ vmovapd 192(%r10), %ymm11
+
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 160(%r10, %r11, 1), %xmm13
+ vbroadcastsd 40(%r12), %ymm12
+ vmovapd 160(%r10), %ymm11
+
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 128(%r10), %ymm11
+
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ // top-left
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r12), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r12), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r12), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vbroadcastsd 0(%r12), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_8x4_lib4, .-inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#endif
+#endif
+
+ // bottom-right
+
+ cmpl $7, %r13d
+ jle 0f
+
+ vmovapd 224(%r10, %r11, 1), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 56(%r12), %ymm12
+ vmovapd 224(%r10), %ymm11
+
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+0:
+ cmpl $6, %r13d
+ jle 1f
+
+	vxorpd %ymm13, %ymm13, %ymm13 // 0.0 (the VEX xmm load below zeroes the upper lanes anyway)
+ vmovapd 192(%r10, %r11, 1), %xmm13
+ vbroadcastsd 48(%r12), %ymm12
+ vmovapd 192(%r10), %ymm11
+
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+1:
+ cmpl $5, %r13d
+ jle 2f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 160(%r10, %r11, 1), %xmm13
+ vbroadcastsd 40(%r12), %ymm12
+ vmovapd 160(%r10), %ymm11
+
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+2:
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 128(%r10), %ymm11
+
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm4, %ymm4
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm5, %ymm5
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm6, %ymm6
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm7, %ymm7
+ vmulpd %ymm11, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ // top-left
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r12), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r12), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r12), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vmulpd %ymm13, %ymm14, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+
+
+ vbroadcastsd 0(%r12), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_8x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// left kernel
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
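+//
+// reference only (not part of the kernel): a plain-C model of the
+// left-looking LU sweep on the 8x4 block, writing the reciprocals of the
+// pivots to inv_diag_E; indices refer to the logical matrices:
+//
+//   for(jj=0; jj<4; jj++)
+//       {
+//       for(kk=0; kk<jj; kk++)
+//           for(ii=kk+1; ii<8; ii++)
+//               D[ii][jj] -= D[ii][kk] * D[kk][jj];
+//       inv_diag_E[jj] = 1.0 / D[jj][jj];
+//       for(ii=jj+1; ii<8; ii++)
+//           D[ii][jj] *= inv_diag_E[jj];
+//       }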
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_l_8x4_lib4, @function
+inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_l_8x4_lib4:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+// vmovddup %xmm14, %xmm14
+
+ // first column
+// vblendpd $0x1, %ymm0, %ymm12, %ymm12
+ vmovapd %ymm0, %ymm12
+ vdivsd %xmm0, %xmm14, %xmm13
+// vpermpd $0x00, %ymm13, %ymm13
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r10)
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vblendpd $0x1, %ymm12, %ymm0, %ymm0
+
+ // second column
+// vpermpd $0x00, %ymm1, %ymm13
+ vmovddup %xmm1, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm5, %ymm5
+ vblendpd $0x2, %ymm1, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+// vpermpd $0x00, %ymm13, %ymm13
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r10)
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vblendpd $0x3, %ymm12, %ymm1, %ymm1
+
+ // third column
+// vpermpd $0x00, %ymm2, %ymm13
+ vmovddup %xmm2, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vblendpd $0x2, %ymm2, %ymm13, %ymm12
+
+// vpermpd $0x55, %ymm2, %ymm13
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm6, %ymm6
+ vblendpd $0x4, %ymm2, %ymm12, %ymm12
+
+// vpermpd $0xaa, %ymm2, %ymm13
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm13
+ vpermilpd $0x0, %ymm13, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+// vpermpd $0x00, %ymm13, %ymm13
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r10)
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vblendpd $0x7, %ymm12, %ymm2, %ymm2
+
+ // fourth column
+// vpermpd $0x00, %ymm3, %ymm13
+ vmovddup %xmm3, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vblendpd $0x2, %ymm3, %ymm13, %ymm12
+
+// vpermpd $0x55, %ymm3, %ymm13
+ vperm2f128 $0x00, %ymm3, %ymm3, %ymm13
+ vpermilpd $0xf, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vblendpd $0x4, %ymm3, %ymm12, %ymm12
+
+// vpermpd $0xaa, %ymm3, %ymm13
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm11
+ vpermilpd $0x0, %ymm11, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm13, %ymm15
+ vsubpd %ymm15, %ymm7, %ymm7
+ vblendpd $0x8, %ymm3, %ymm12, %ymm12
+
+// vpermpd $0xff, %ymm3, %ymm13
+// vperm2f128 $0x11, %ymm3, %ymm3, %ymm11
+ vpermilpd $0xf, %ymm11, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+// vpermpd $0x00, %ymm13, %ymm13
+ vmovddup %xmm13, %xmm13
+ vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r10)
+// vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vblendpd $0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_l_8x4_lib4, .-inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
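+//
+// reference only: what the store amounts to in plain C, assuming the lib4
+// panel-major layout, i.e. column jj of rows 0..3 at D + 4*jj doubles and
+// column jj of rows 4..7 at D + 4*sdd + 4*jj doubles (c[][] denotes the
+// result held in ymm0..ymm7):
+//
+//   for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<4; ii++)
+//           {
+//           D[ii+4*jj]           = c[ii][jj];   // rows 0..3
+//           D[4*sdd + ii + 4*jj] = c[4+ii][jj]; // rows 4..7
+//           }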
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_lib4, @function
+inner_store_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_lib4, @function
+inner_store_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 128(%r10)
+ vmovapd %ymm5, 160(%r10)
+ vmovapd %ymm6, 192(%r10)
+ vmovapd %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_lib4, .-inner_store_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
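+//
+// reference only: a plain-C model of the masked store, same panel layout
+// as inner_store_8x4_lib4; kn limits the number of stored columns and km
+// masks the rows of the lower panel:
+//
+//   for(jj=0; jj<kn; jj++)
+//       {
+//       for(ii=0; ii<4; ii++)
+//           D[ii+4*jj] = c[ii][jj];                   // rows 0..3
+//       for(ii=4; ii<km; ii++)
+//           D[4*sdd + (ii-4) + 4*jj] = c[ii][jj];     // rows 4..7, masked
+//       }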
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_vs_lib4, @function
+inner_store_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ cmpl $2, %r13d
+ vmovapd %ymm0, 0(%r10)
+ vmaskmovpd %ymm4, %ymm15, 0(%r10, %r11, 1)
+ jl 0f // end
+ cmpl $3, %r13d
+ vmovapd %ymm1, 32(%r10)
+ vmaskmovpd %ymm5, %ymm15, 32(%r10, %r11, 1)
+ jl 0f // end
+ vmovapd %ymm2, 64(%r10)
+ vmaskmovpd %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovapd %ymm3, 96(%r10)
+ vmaskmovpd %ymm7, %ymm15, 96(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_vs_lib4, .-inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_vs_lib4, @function
+inner_store_4x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+ vmaskmovpd %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmaskmovpd %ymm7, %ymm15, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_vs_lib4, .-inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
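+//
+// reference only: a plain-C model of the lower store; in column jj of the
+// upper panel only rows ii>=jj are overwritten (the strictly upper part is
+// re-read from D and kept), the lower panel is stored in full:
+//
+//   for(jj=0; jj<4; jj++)
+//       {
+//       for(ii=jj; ii<4; ii++) D[ii+4*jj]           = c[ii][jj];
+//       for(ii=0;  ii<4; ii++) D[4*sdd + ii + 4*jj] = c[4+ii][jj];
+//       }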
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_lib4, @function
+inner_store_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib4:
+#endif
+#endif
+
+	vmovapd %ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_lib4, .-inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_vs_lib4, @function
+inner_store_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ cmpl $2, %r13d
+ vmovapd %ymm0, 0(%r10)
+ vmaskmovpd %ymm4, %ymm15, 0(%r10, %r11, 1)
+ jl 0f // end
+ cmpl $3, %r13d
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmaskmovpd %ymm5, %ymm15, 32(%r10, %r11, 1)
+ jl 0f // end
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmaskmovpd %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+ vmaskmovpd %ymm7, %ymm15, 96(%r10, %r11, 1)
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_vs_lib4, .-inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
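+// Indexing sketch (comment only): before the masked stores, the first n0
+// columns are skipped by shifting the accumulator registers and advancing D
+// one 32-byte column at a time, the column count is clamped as below, and the
+// row masks are built from m0/m1 with the same .LC02/.LC03 trick as the vs
+// store; the helper name is illustrative only.
+//
+// static int gen_store_cols(int n0, int n1)
+// 	{
+// 	int n1c = n1>4 ? 4 : n1; // cmpl $4, %eax / movl $4, %eax
+// 	return n1c - n0;         // subl %r15d, %eax
+// 	}
+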
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_gen_lib4, @function
+inner_store_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+ vmovupd .LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+ vmovupd LC03(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm3, %ymm2
+ vmovapd %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm14, 0(%r11)
+ vmaskmovpd %ymm4, %ymm15, 0(%r11, %r12, 1)
+ jl 4f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm14, 32(%r11)
+ vmaskmovpd %ymm5, %ymm15, 32(%r11, %r12, 1)
+ jl 4f // end
+ vmaskmovpd %ymm2, %ymm14, 64(%r11)
+ vmaskmovpd %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 4f // end
+ vmaskmovpd %ymm3, %ymm14, 96(%r11)
+ vmaskmovpd %ymm7, %ymm15, 96(%r11, %r12, 1)
+
+ jmp 4f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm12
+ vshufpd $0x5, %ymm4, %ymm12, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm12
+ vshufpd $0x5, %ymm5, %ymm12, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm12
+ vshufpd $0x5, %ymm6, %ymm12, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm12
+ vshufpd $0x5, %ymm7, %ymm12, %ymm7
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x1, %ymm14, %ymm15, %ymm14
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x3, %ymm14, %ymm15, %ymm14
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x21, %ymm0, %ymm4, %ymm12
+ vshufpd $0x5, %ymm12, %ymm4, %ymm0
+ vperm2f128 $0x21, %ymm4, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x21, %ymm1, %ymm5, %ymm12
+ vshufpd $0x5, %ymm12, %ymm5, %ymm1
+ vperm2f128 $0x21, %ymm5, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x21, %ymm2, %ymm6, %ymm12
+ vshufpd $0x5, %ymm12, %ymm6, %ymm2
+ vperm2f128 $0x21, %ymm6, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x21, %ymm3, %ymm7, %ymm12
+ vshufpd $0x5, %ymm12, %ymm7, %ymm3
+ vperm2f128 $0x21, %ymm7, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm12, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x7, %ymm14, %ymm15, %ymm14
+
+3:
+
+ cmpl $2, %r15d
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ jl 4f // end
+ cmpl $3, %r15d
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ jl 4f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 4f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+4:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_gen_lib4, .-inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_gen_lib4, @function
+inner_store_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+ vmovupd .LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+ vmovupd LC03(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm3, %ymm2
+ vmovapd %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm13
+#endif
+
+ vmaskmovpd %ymm0, %ymm14, 0(%r11)
+ vmaskmovpd %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm1, %ymm14, 32(%r11)
+ vmaskmovpd %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm2, %ymm14, 64(%r11)
+ vmaskmovpd %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x4, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm14, 96(%r11)
+ vmaskmovpd %ymm7, %ymm15, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm12
+ vshufpd $0x5, %ymm4, %ymm12, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm12
+ vshufpd $0x5, %ymm5, %ymm12, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm12
+ vshufpd $0x5, %ymm6, %ymm12, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm12
+ vshufpd $0x5, %ymm7, %ymm12, %ymm7
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x1, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x3, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x21, %ymm0, %ymm4, %ymm12
+ vshufpd $0x5, %ymm12, %ymm4, %ymm0
+ vperm2f128 $0x21, %ymm4, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x21, %ymm1, %ymm5, %ymm12
+ vshufpd $0x5, %ymm12, %ymm5, %ymm1
+ vperm2f128 $0x21, %ymm5, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x21, %ymm2, %ymm6, %ymm12
+ vshufpd $0x5, %ymm12, %ymm6, %ymm2
+ vperm2f128 $0x21, %ymm6, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x21, %ymm3, %ymm7, %ymm12
+ vshufpd $0x5, %ymm12, %ymm7, %ymm3
+ vperm2f128 $0x21, %ymm7, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm12, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x7, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x2, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_gen_lib4, .-inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemm_nt_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
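+// Usage sketch (comment only, values illustrative): computes one 8x4 block of
+// D = alpha*A*B^T + beta*C with every operand in 4-wide panel-major (lib4)
+// storage; A, C and D span two row panels, hence the sda/sdc/sdd strides.
+//
+// double alpha = 1.0, beta = 1.0;
+// kernel_dgemm_nt_8x4_lib4(k, &alpha, A, sda, B, &beta, C, sdc, D, sdd);
+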
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_lib4
+ .type kernel_dgemm_nt_8x4_lib4, @function
+kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_lib4
+_kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_lib4
+ .def kernel_dgemm_nt_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_lib4, .-kernel_dgemm_nt_8x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dgemm_nt_4x8_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D);
+
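+// Usage sketch (comment only, values illustrative): the 4x8 variant reuses the
+// 8x4 nt inner kernel with the roles of A and B swapped and transposes the
+// accumulator while scaling (inner_tran_scale_ab_4x8_lib4), so only B carries
+// a panel stride here.
+//
+// double alpha = 1.0, beta = 0.0;
+// kernel_dgemm_nt_4x8_lib4(k, &alpha, A, B, sdb, &beta, C, D);
+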
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x8_lib4
+ .type kernel_dgemm_nt_4x8_lib4, @function
+kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x8_lib4
+_kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x8_lib4
+ .def kernel_dgemm_nt_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG5, %r12 // sdb
+ sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x8_lib4, .-kernel_dgemm_nt_4x8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_nt_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_vs_lib4
+ .type kernel_dgemm_nt_8x4_vs_lib4, @function
+kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_vs_lib4
+_kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_vs_lib4
+ .def kernel_dgemm_nt_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_vs_lib4, .-kernel_dgemm_nt_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemm_nt_4x8_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x8_vs_lib4
+ .type kernel_dgemm_nt_4x8_vs_lib4, @function
+kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x8_vs_lib4
+_kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x8_vs_lib4
+ .def kernel_dgemm_nt_4x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG5, %r12 // sdb
+ sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x8_vs_lib4, .-kernel_dgemm_nt_4x8_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dgemm_nt_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
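+// Clipping sketch (comment only): the _gen variant only touches result
+// elements whose row index lies in [m0, m1) and whose column index lies in
+// [n0, n1); offsetC and offsetD give the row offset of C and D inside their
+// first 4-row panel. The helper name is illustrative only.
+//
+// static int in_stored_range(int i, int j, int m0, int m1, int n0, int n1)
+// 	{
+// 	return i>=m0 && i<m1 && j>=n0 && j<n1;
+// 	}
+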
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_gen_lib4
+ .type kernel_dgemm_nt_8x4_gen_lib4, @function
+kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_gen_lib4
+_kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_gen_lib4
+ .def kernel_dgemm_nt_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+ sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_gen_lib4, .-kernel_dgemm_nt_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_nn_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_8x4_lib4
+ .type kernel_dgemm_nn_8x4_lib4, @function
+kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_8x4_lib4
+_kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_8x4_lib4
+ .def kernel_dgemm_nn_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_8x4_lib4, .-kernel_dgemm_nn_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nn_4x8_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x8_lib4
+ .type kernel_dgemm_nn_4x8_lib4, @function
+kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x8_lib4
+_kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x8_lib4
+ .def kernel_dgemm_nn_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x8_lib4, .-kernel_dgemm_nn_4x8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88 rsp+96
+// void kernel_dgemm_nn_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_8x4_gen_lib4
+ .type kernel_dgemm_nn_8x4_gen_lib4, @function
+kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_8x4_gen_lib4
+_kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_8x4_gen_lib4
+ .def kernel_dgemm_nn_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // offsetC
+ movq ARG10, %r13 // C
+ movq ARG11, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG12, %r10 // offsetD
+ movq ARG13, %r11 // D
+ movq ARG14, %r12 // sdd
+ sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG15, %r13 // m0
+ movq ARG16, %r14 // m1
+ movq ARG17, %r15 // n0
+ movq ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_8x4_gen_lib4, .-kernel_dgemm_nn_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dsyrk_nt_l_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_lib4
+ .type kernel_dsyrk_nt_l_8x4_lib4, @function
+kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_lib4
+_kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_lib4
+ .def kernel_dsyrk_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_lib4, .-kernel_dsyrk_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dsyrk_nt_l_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_vs_lib4
+ .type kernel_dsyrk_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_vs_lib4
+_kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_vs_lib4
+ .def kernel_dsyrk_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_vs_lib4, .-kernel_dsyrk_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dsyrk_nt_l_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_gen_lib4
+ .type kernel_dsyrk_nt_l_8x4_gen_lib4, @function
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_gen_lib4
+_kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_gen_lib4
+ .def kernel_dsyrk_nt_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+ sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_gen_lib4, .-kernel_dsyrk_nt_l_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nn_rl_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_8x4_lib4
+ .type kernel_dtrmm_nn_rl_8x4_lib4, @function
+kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_8x4_lib4
+_kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_8x4_lib4
+ .def kernel_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_8x4_lib4, .-kernel_dtrmm_nn_rl_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dtrmm_nn_rl_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+ .type kernel_dtrmm_nn_rl_8x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_8x4_gen_lib4
+_kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+ .def kernel_dtrmm_nn_rl_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // offsetD
+ movq ARG9, %r11 // D
+ movq ARG10, %r12 // sdd
+ sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG11, %r13 // m0
+ movq ARG12, %r14 // m1
+ movq ARG13, %r15 // n0
+ movq ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_8x4_gen_lib4, .-kernel_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrmm_nt_ru_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_8x4_lib4
+ .type kernel_dtrmm_nt_ru_8x4_lib4, @function
+kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_8x4_lib4
+_kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_8x4_lib4
+ .def kernel_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ addq $128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10 // A
+ movq ARG4, %r11 // sda
+ sall $5, %r11d // 4*sda*sizeof(double)
+ movq ARG5, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_8x4_lib4, .-kernel_dtrmm_nt_ru_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrmm_nt_ru_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_8x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_8x4_vs_lib4
+_kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ addq $128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_8x4_vs_lib4, .-kernel_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dpotrf_nt_l_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
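+// Semantics sketch (comment only): this kernel forms C - A*B^T for an 8x4
+// block, Cholesky-factorizes the 4x4 diagonal block, solves the lower 4 rows
+// against that factor, and stores the lower part in D; inv_diag_D is assumed
+// to receive the reciprocals of the 4 diagonal entries for use by downstream
+// solve kernels.
+//
+// kernel_dpotrf_nt_l_8x4_lib4(k, A, sda, B, C, sdc, D, sdd, inv_diag_D);
+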
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_8x4_lib4
+ .type kernel_dpotrf_nt_l_8x4_lib4, @function
+kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_8x4_lib4
+_kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_8x4_lib4
+ .def kernel_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_8x4_lib4, .-kernel_dpotrf_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dpotrf_nt_l_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_8x4_vs_lib4
+ .type kernel_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_8x4_vs_lib4
+ .def kernel_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dsyrk_dpotrf_nt_l_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
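+// fused dsyrk+dpotrf: computes the lower Cholesky factor D of (C + Ap * Bp^T - Am * Bm^T); the reciprocals of the diagonal of D are returned in inv_diag_D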
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_8x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_8x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
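+// variable-size triangular solve: D = (C - A * B^T) * E^-T, with E lower triangular and the reciprocals of its diagonal passed in inv_diag_E; km/kn select the stored part of D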
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
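+// fused dgemm+dtrsm: D = (C + Ap * Bp^T - Am * Bm^T) * E^-T, with E lower triangular and the reciprocals of its diagonal passed in inv_diag_E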
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movq ARG16, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG15, %r12 // km
+ movq ARG16, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nt_rl_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_one_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
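+// triangular solve: D = (C - A * B^T) * E^-T, with E unit lower triangular (no inv_diag_E needed)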
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_8x4_lib4
+ .type kernel_dtrsm_nt_rl_one_8x4_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_8x4_lib4
+_kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_8x4_lib4
+ .def kernel_dtrsm_nt_rl_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_8x4_lib4, .-kernel_dtrsm_nt_rl_one_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nt_rl_one_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_one_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nt_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
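+// triangular solve: D = (C - A * B^T) * E^-T, with E upper triangular and the reciprocals of its diagonal passed in inv_diag_E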
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+ .type kernel_dtrsm_nt_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_8x4_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+ .def kernel_dtrsm_nt_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_8x4_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nt_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nn_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
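+// triangular solve: D = (C - A * B) * E^-1, with E upper triangular (not transposed) and the reciprocals of its diagonal passed in inv_diag_E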
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+ .type kernel_dtrsm_nn_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_8x4_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+ .def kernel_dtrsm_nn_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_8x4_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nn_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nn_ll_one_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
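+// triangular solve from the left: D = E^-1 * (C - A * B), with E an 8x8 unit lower triangular panel of stride sde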
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_8x4_lib4
+ .type kernel_dtrsm_nn_ll_one_8x4_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_8x4_lib4
+_kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_8x4_lib4
+ .def kernel_dtrsm_nn_ll_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_8x4_lib4, .-kernel_dtrsm_nn_ll_one_8x4_lib4
+#endif
+
+
+
+
+
+// edi  rsi  rdx  ecx  r8  r9  rsp+8  rsp+16  rsp+24  rsp+32  rsp+40  rsp+48  rsp+56
+// void kernel_dtrsm_nn_ll_one_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+ .type kernel_dtrsm_nn_ll_one_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+ .def kernel_dtrsm_nn_ll_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_8x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nn_lu_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
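+// triangular solve from the left: D = E^-1 * (C - A * B), with E an 8x8 upper triangular panel of stride sde and the reciprocals of its diagonal passed in inv_diag_E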
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+ .type kernel_dtrsm_nn_lu_inv_8x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_8x4_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+ .def kernel_dtrsm_nn_lu_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_8x4_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nn_lu_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+ movq ARG13, %r13 // km
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG13, %r12 // km
+	movq	ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgetrf_nn_l_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
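+// unpivoted LU factorization of the 8x4 block (C - A * B): D receives the L and U factors, and inv_diag_D the reciprocals of the diagonal pivots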
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_l_8x4_lib4
+ .type kernel_dgetrf_nn_l_8x4_lib4, @function
+kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_l_8x4_lib4
+_kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_l_8x4_lib4
+ .def kernel_dgetrf_nn_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ // epilogue
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_l_8x4_lib4, .-kernel_dgetrf_nn_l_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgetrf_nn_l_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_l_8x4_vs_lib4
+ .type kernel_dgetrf_nn_l_8x4_vs_lib4, @function
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_l_8x4_vs_lib4
+_kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_l_8x4_vs_lib4
+ .def kernel_dgetrf_nn_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_l_8x4_vs_lib4, .-kernel_dgetrf_nn_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dlarfb4_r_8_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
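+// applies a block of 4 Householder reflectors to an 8-row panel of D, in the spirit of LAPACK dlarfb (right side): it forms W = D * V^T, multiplies W by the 4x4 triangular factor T, and updates D = D + W * V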
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb4_r_8_lib4
+ .type kernel_dlarfb4_r_8_lib4, @function
+kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb4_r_8_lib4
+_kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb4_r_8_lib4
+ .def kernel_dlarfb4_r_8_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG5, %r12 // sdd
+ sall $5, %r12d
+ movq ARG2, %r13 // V
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG5, %r12 // sdd
+ sall $5, %r12d
+ movq ARG2, %r13 // V
+
+	// column 0 of D
+ vmovapd 0(%r11), %ymm12
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vaddpd %ymm12, %ymm0, %ymm0
+ vaddpd %ymm14, %ymm4, %ymm4
+	// column 1 of D
+ vmovapd 32(%r11), %ymm12
+ vmovapd 32(%r11, %r12, 1), %ymm14
+ vaddpd %ymm12, %ymm1, %ymm1
+ vaddpd %ymm14, %ymm5, %ymm5
+ vbroadcastsd 32(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+	// column 2 of D
+ vmovapd 64(%r11), %ymm12
+ vmovapd 64(%r11, %r12, 1), %ymm14
+ vaddpd %ymm12, %ymm2, %ymm2
+ vaddpd %ymm14, %ymm6, %ymm6
+ vbroadcastsd 64(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 72(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+	// column 3 of D
+ vmovapd 96(%r11), %ymm12
+ vmovapd 96(%r11, %r12, 1), %ymm14
+ vaddpd %ymm12, %ymm3, %ymm3
+ vaddpd %ymm14, %ymm7, %ymm7
+ vbroadcastsd 96(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm4, %ymm4
+ vbroadcastsd 104(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 112(%r13), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm13, %ymm14, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+
+ movq ARG3, %r10 // T
+
+	// row 3 of T
+ vbroadcastsd 120(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+	// row 2 of T
+ vbroadcastsd 112(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm6, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 80(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+	// row 1 of T
+ vbroadcastsd 104(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm5, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 72(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm5, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 40(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+	// row 0 of T
+ vbroadcastsd 96(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm3, %ymm3
+ vmulpd %ymm4, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 64(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm2, %ymm2
+ vmulpd %ymm4, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 32(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm4, %ymm12, %ymm15
+ vaddpd %ymm15, %ymm5, %ymm5
+ vbroadcastsd 0(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // V
+ movq ARG4, %r12 // D
+ movq ARG5, %r13 // sdd
+ sall $5, %r13d
+
+	// column 0 of D
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vaddpd %ymm12, %ymm0, %ymm12
+ vaddpd %ymm14, %ymm4, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+	// column 1 of D
+ vmovapd 32(%r12), %ymm12
+ vmovapd 32(%r12, %r13, 1), %ymm14
+ vbroadcastsd 32(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm12, %ymm1, %ymm12
+ vaddpd %ymm14, %ymm5, %ymm14
+ vmovapd %ymm12, 32(%r12)
+ vmovapd %ymm14, 32(%r12, %r13, 1)
+	// column 2 of D
+ vmovapd 64(%r12), %ymm12
+ vmovapd 64(%r12, %r13, 1), %ymm14
+ vbroadcastsd 64(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 72(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm12, %ymm2, %ymm12
+ vaddpd %ymm14, %ymm6, %ymm14
+ vmovapd %ymm12, 64(%r12)
+ vmovapd %ymm14, 64(%r12, %r13, 1)
+	// column 3 of D
+ vmovapd 96(%r12), %ymm12
+ vmovapd 96(%r12, %r13, 1), %ymm14
+ vbroadcastsd 96(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm4, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 104(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm5, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vbroadcastsd 112(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm12, %ymm12
+ vmulpd %ymm6, %ymm13, %ymm15
+ vaddpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm12, %ymm3, %ymm12
+ vaddpd %ymm14, %ymm7, %ymm14
+ vmovapd %ymm12, 96(%r12)
+ vmovapd %ymm14, 96(%r12, %r13, 1)
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgebp_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb4_r_8_lib4, .-kernel_dlarfb4_r_8_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+	.align 5
+LC00: // { -1 -1 -1 1 }
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+	.align 5
+LC01: // { -1 -1 -1 -1 }
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC02: // { 3.5 2.5 1.5 0.5 }
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+	.align 5
+LC03: // { 7.5 6.5 5.5 4.5 }
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+	.align 5
+LC04: // { 1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemm_diag_lib4.c b/kernel/avx/kernel_dgemm_diag_lib4.c
new file mode 100644
index 0000000..d64f977
--- /dev/null
+++ b/kernel/avx/kernel_dgemm_diag_lib4.c
@@ -0,0 +1,866 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
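+// Note on the packed "lib4" layout used below: matrices are stored in panels
+// of 4 rows, each panel column-major, so A[0..3], A[4..7], ... are consecutive
+// 4-element columns and "A += 4*sda" steps to the next 4-row panel (sda, sdc,
+// sdd being the panel strides of A, C, D). The *_right_* kernels compute
+// D = alpha*A*diag(B) + beta*C, the *_left_* kernels D = alpha*diag(A)*B + beta*C,
+// and the *_a0 variants skip the beta*C term.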
+
+
+// B is the diagonal of a matrix, beta==0.0 case
+void kernel_dgemm_diag_right_4_a0_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22, b_33,
+ d_00, d_01, d_02, d_03;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+
+ b_00 = _mm256_broadcast_sd( &B[0] );
+ b_00 = _mm256_mul_pd( b_00, alpha0 );
+ b_11 = _mm256_broadcast_sd( &B[1] );
+ b_11 = _mm256_mul_pd( b_11, alpha0 );
+ b_22 = _mm256_broadcast_sd( &B[2] );
+ b_22 = _mm256_mul_pd( b_22, alpha0 );
+ b_33 = _mm256_broadcast_sd( &B[3] );
+ b_33 = _mm256_mul_pd( b_33, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+ a_00 = _mm256_load_pd( &A[12] );
+ d_03 = _mm256_mul_pd( a_00, b_33 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+ _mm256_store_pd( &D[12], d_03 );
+
+ A += 4*sda;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
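+		// tail handling: m_f = kmax-k is the number of rows left (1..3);
+		// subtracting it from {0.5, 1.5, 2.5, 3.5} makes exactly the first
+		// m_f lanes negative, and _mm256_maskstore_pd writes only the lanes
+		// whose mask element has the sign bit set, so rows at and beyond
+		// kmax are left untouched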
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+ a_00 = _mm256_load_pd( &A[12] );
+ d_03 = _mm256_mul_pd( a_00, b_33 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+ _mm256_maskstore_pd( &D[4], mask_i, d_01 );
+ _mm256_maskstore_pd( &D[8], mask_i, d_02 );
+ _mm256_maskstore_pd( &D[12], mask_i, d_03 );
+
+ }
+
+ }
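+
+// Scalar reference for the kernel above (editor's note, for documentation
+// only): for each of the kmax rows i it computes D[i][j] = alpha*A[i][j]*B[j]
+// for the 4 columns j, i.e. D = alpha * A * diag(B) with beta == 0.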
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22, b_33,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ b_00 = _mm256_broadcast_sd( &B[0] );
+ b_00 = _mm256_mul_pd( b_00, alpha0 );
+ b_11 = _mm256_broadcast_sd( &B[1] );
+ b_11 = _mm256_mul_pd( b_11, alpha0 );
+ b_22 = _mm256_broadcast_sd( &B[2] );
+ b_22 = _mm256_mul_pd( b_22, alpha0 );
+ b_33 = _mm256_broadcast_sd( &B[3] );
+ b_33 = _mm256_mul_pd( b_33, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+ a_00 = _mm256_load_pd( &A[12] );
+ d_03 = _mm256_mul_pd( a_00, b_33 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+ c_00 = _mm256_load_pd( &C[12] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_03 = _mm256_add_pd( c_00, d_03 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+ _mm256_store_pd( &D[12], d_03 );
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+ a_00 = _mm256_load_pd( &A[12] );
+ d_03 = _mm256_mul_pd( a_00, b_33 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+ c_00 = _mm256_load_pd( &C[12] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_03 = _mm256_add_pd( c_00, d_03 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+ _mm256_maskstore_pd( &D[4], mask_i, d_01 );
+ _mm256_maskstore_pd( &D[8], mask_i, d_02 );
+ _mm256_maskstore_pd( &D[12], mask_i, d_03 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_3_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22,
+ c_00,
+ d_00, d_01, d_02;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ b_00 = _mm256_broadcast_sd( &B[0] );
+ b_00 = _mm256_mul_pd( b_00, alpha0 );
+ b_11 = _mm256_broadcast_sd( &B[1] );
+ b_11 = _mm256_mul_pd( b_11, alpha0 );
+ b_22 = _mm256_broadcast_sd( &B[2] );
+ b_22 = _mm256_mul_pd( b_22, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+ a_00 = _mm256_load_pd( &A[8] );
+ d_02 = _mm256_mul_pd( a_00, b_22 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+ _mm256_maskstore_pd( &D[4], mask_i, d_01 );
+ _mm256_maskstore_pd( &D[8], mask_i, d_02 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_2_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11,
+ c_00,
+ d_00, d_01;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ b_00 = _mm256_broadcast_sd( &B[0] );
+ b_00 = _mm256_mul_pd( b_00, alpha0 );
+ b_11 = _mm256_broadcast_sd( &B[1] );
+ b_11 = _mm256_mul_pd( b_11, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ a_00 = _mm256_load_pd( &A[4] );
+ d_01 = _mm256_mul_pd( a_00, b_11 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+ _mm256_maskstore_pd( &D[4], mask_i, d_01 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_dgemm_diag_right_1_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ b_00 = _mm256_broadcast_sd( &B[0] );
+	b_00 = _mm256_mul_pd( b_00, alpha0 ); // scale by alpha, as in the other diag kernels
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+
+ _mm256_store_pd( &D[0], d_00 );
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const double mask_f[] = {0.5, 1.5, 2.5, 3.5};
+ double m_f = kmax-k;
+
+ mask_i = _mm256_castpd_si256( _mm256_sub_pd( _mm256_loadu_pd( mask_f ), _mm256_broadcast_sd( &m_f ) ) );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+
+ _mm256_maskstore_pd( &D[0], mask_i, d_00 );
+
+ }
+
+ }
+
+
+
+// A is the diagonal of a matrix, beta=0.0 case
+void kernel_dgemm_diag_left_4_a0_lib4(int kmax, double *alpha, double *A, double *B, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0,
+ sign,
+ a_00,
+ b_00,
+ d_00, d_01, d_02, d_03;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ a_00 = _mm256_mul_pd( a_00, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[4] );
+ d_01 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[8] );
+ d_02 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[12] );
+ d_03 = _mm256_mul_pd( a_00, b_00 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+ _mm256_store_pd( &D[12], d_03 );
+
+ B += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ _mm256_store_pd( &D[0], d_00 );
+
+ B += 4;
+ D += 4;
+
+ }
+
+ }
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256d
+ alpha0, beta0,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ a_00 = _mm256_mul_pd( a_00, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[4] );
+ d_01 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[8] );
+ d_02 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[12] );
+ d_03 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+ c_00 = _mm256_load_pd( &C[12] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_03 = _mm256_add_pd( c_00, d_03 );
+
+ _mm256_store_pd( &D[0], d_00 );
+ _mm256_store_pd( &D[4], d_01 );
+ _mm256_store_pd( &D[8], d_02 );
+ _mm256_store_pd( &D[12], d_03 );
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+
+ _mm256_store_pd( &D[0], d_00 );
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_3_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m256i
+ mask;
+
+ __m256d
+ alpha0, beta0,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
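+	// store mask for a 3-row result: _mm256_maskstore_pd checks only the sign
+	// bit of each 64-bit lane, so the three -1 lanes are written while the
+	// last lane (1, sign bit clear) leaves row 3 of D untouched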
+ mask = _mm256_set_epi64x( 1, -1, -1, -1 );
+
+ alpha0 = _mm256_broadcast_sd( alpha );
+ beta0 = _mm256_broadcast_sd( beta );
+
+ a_00 = _mm256_load_pd( &A[0] );
+ a_00 = _mm256_mul_pd( a_00, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[4] );
+ d_01 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[8] );
+ d_02 = _mm256_mul_pd( a_00, b_00 );
+ b_00 = _mm256_load_pd( &B[12] );
+ d_03 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+ c_00 = _mm256_load_pd( &C[4] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_01 = _mm256_add_pd( c_00, d_01 );
+ c_00 = _mm256_load_pd( &C[8] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_02 = _mm256_add_pd( c_00, d_02 );
+ c_00 = _mm256_load_pd( &C[12] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_03 = _mm256_add_pd( c_00, d_03 );
+
+ _mm256_maskstore_pd( &D[0], mask, d_00 );
+ _mm256_maskstore_pd( &D[4], mask, d_01 );
+ _mm256_maskstore_pd( &D[8], mask, d_02 );
+ _mm256_maskstore_pd( &D[12], mask, d_03 );
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_00 = _mm256_load_pd( &B[0] );
+ d_00 = _mm256_mul_pd( a_00, b_00 );
+
+ c_00 = _mm256_load_pd( &C[0] );
+ c_00 = _mm256_mul_pd( c_00, beta0 );
+ d_00 = _mm256_add_pd( c_00, d_00 );
+
+ _mm256_maskstore_pd( &D[0], mask, d_00 );
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_2_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ __m128d
+ alpha0, beta0,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
+ alpha0 = _mm_loaddup_pd( alpha );
+ beta0 = _mm_loaddup_pd( beta );
+
+ a_00 = _mm_load_pd( &A[0] );
+ a_00 = _mm_mul_pd( a_00, alpha0 );
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_00 = _mm_load_pd( &B[0] );
+ d_00 = _mm_mul_pd( a_00, b_00 );
+ b_00 = _mm_load_pd( &B[4] );
+ d_01 = _mm_mul_pd( a_00, b_00 );
+ b_00 = _mm_load_pd( &B[8] );
+ d_02 = _mm_mul_pd( a_00, b_00 );
+ b_00 = _mm_load_pd( &B[12] );
+ d_03 = _mm_mul_pd( a_00, b_00 );
+
+ c_00 = _mm_load_pd( &C[0] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_00 = _mm_add_pd( c_00, d_00 );
+ c_00 = _mm_load_pd( &C[4] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_01 = _mm_add_pd( c_00, d_01 );
+ c_00 = _mm_load_pd( &C[8] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_02 = _mm_add_pd( c_00, d_02 );
+ c_00 = _mm_load_pd( &C[12] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_03 = _mm_add_pd( c_00, d_03 );
+
+ _mm_store_pd( &D[0], d_00 );
+ _mm_store_pd( &D[4], d_01 );
+ _mm_store_pd( &D[8], d_02 );
+ _mm_store_pd( &D[12], d_03 );
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_00 = _mm_load_pd( &B[0] );
+ d_00 = _mm_mul_pd( a_00, b_00 );
+
+ c_00 = _mm_load_pd( &C[0] );
+ c_00 = _mm_mul_pd( c_00, beta0 );
+ d_00 = _mm_add_pd( c_00, d_00 );
+
+ _mm_store_pd( &D[0], d_00 );
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+
+ }
+
+
+// A is the diagonal of a matrix
+void kernel_dgemm_diag_left_1_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0,
+ b_0,
+ c_0;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = A[0] * alpha0;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ b_0 = B[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+ D[0+bs*1] = c_0;
+
+
+ b_0 = B[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+ D[0+bs*2] = c_0;
+
+
+ b_0 = B[0+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+ D[0+bs*3] = c_0;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
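+
+// Illustrative driver for the kernels above (editor's sketch; the actual
+// BLASFEO wrappers and the panel indexing convention may differ, and dA, pB,
+// pC, pD, sdb, sdc, sdd are placeholder names): tile the m rows in blocks of 4
+// and dispatch on the remainder, e.g.
+//
+//	for(i=0; i<m-3; i+=4)
+//		kernel_dgemm_diag_left_4_lib4(n, &alpha, &dA[i], &pB[i*sdb], &beta, &pC[i*sdc], &pD[i*sdd]);
+//	if(m-i==3)
+//		kernel_dgemm_diag_left_3_lib4(n, &alpha, &dA[i], &pB[i*sdb], &beta, &pC[i*sdc], &pD[i*sdd]);
+//	else if(m-i==2)
+//		kernel_dgemm_diag_left_2_lib4(n, &alpha, &dA[i], &pB[i*sdb], &beta, &pC[i*sdc], &pD[i*sdd]);
+//	else if(m-i==1)
+//		kernel_dgemm_diag_left_1_lib4(n, &alpha, &dA[i], &pB[i*sdb], &beta, &pC[i*sdc], &pD[i*sdd]);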
+
+
+
diff --git a/kernel/avx/kernel_dgemv_12_lib4.S b/kernel/avx/kernel_dgemv_12_lib4.S
new file mode 100644
index 0000000..c51ad9a
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_12_lib4.S
@@ -0,0 +1,1322 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
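+
+// The ARG*/PROLOGUE/EPILOGUE macros above hide the calling-convention
+// differences: System V (Linux/Mac) passes the first six integer arguments in
+// registers and the rest on the stack, while Win64 passes only four in
+// registers and requires xmm6-xmm15 to be preserved, hence the larger stack
+// frame and the vmovups spills in the Windows variant.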
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z8 z9 za zb]_a
+// ymm3 <- [z0 z1 z2 z3]_b
+// ymm4 <- [z4 z5 z6 z7]_b
+// ymm5 <- [z8 z9 za zb]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x+k*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z8 z9 za zb]_a
+// ymm3 <- [z0 z1 z2 z3]_b
+// ymm4 <- [z4 z5 z6 z7]_b
+// ymm5 <- [z8 z9 za zb]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
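+//
+// note: computes z[0:12] += A * x over k columns; A is split in three
+// consecutive 4-row panels (r11, r11+r12, r11+2*r12), even columns accumulate
+// into ymm0-2 and odd columns into ymm3-5, and the two accumulator sets are
+// summed afterwards in the blend routine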
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_N_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_n_12_lib4, @function
+inner_kernel_dgemv_add_n_12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_n_12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_12_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r14 // A1 <- A0
+ addq %r12, %r14 // A1 <- A0 + 4*sda*sizeof(double)
+ movq %r14, %r15 // A2 <- A1
+ addq %r12, %r15 // A2 <- A1 + 4*sda*sizeof(double)
+
+ cmpl $4, %r10d
+
+ prefetcht0 0(%r11) // software prefetch
+ prefetcht0 0(%r14) // software prefetch
+ prefetcht0 0(%r15) // software prefetch
+ prefetcht0 64(%r11) // software prefetch
+ prefetcht0 64(%r14) // software prefetch
+ prefetcht0 64(%r15) // software prefetch
+
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 128(%r11) // software prefetch
+ prefetcht0 128(%r14) // software prefetch
+ prefetcht0 128(%r15) // software prefetch
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r14), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 0(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ subl $4, %r10d
+
+ vbroadcastsd 8(%r13), %ymm12
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmovapd 32(%r14), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ prefetcht0 192(%r11) // software prefetch
+ prefetcht0 192(%r14) // software prefetch
+ prefetcht0 192(%r15) // software prefetch
+
+ vbroadcastsd 16(%r13), %ymm12
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r14), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vbroadcastsd 24(%r13), %ymm12
+ addq $32, %r13 // x+4
+ vmovapd 96(%r11), %ymm8
+ addq $128, %r11 // A0+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmovapd 96(%r14), %ymm8
+ addq $128, %r14 // A1+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 96(%r15), %ymm8
+ addq $128, %r15 // A2+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r14), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 0(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ addq $32, %r11
+ addq $32, %r14
+ addq $32, %r15
+ addq $8, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+
+ jg 0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_n_12_lib4, .-inner_kernel_dgemv_add_n_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- [z8a z8b z8c z8d]
+// ymm9 <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x+k*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- [z8a z8b z8c z8d]
+// ymm9 <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
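+//
+// note: accumulates the partial dot products A(:,j)^T * x for 12 consecutive
+// columns of A, one ymm accumulator (four partial sums) per column; the
+// blend_t routine reduces each accumulator to the final scalar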
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_T_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_t_12_lib4, @function
+inner_kernel_dgemv_add_t_12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_t_12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_12_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+
+ prefetcht0 0(%r11) // software prefetch
+ prefetcht0 64(%r11) // software prefetch
+ prefetcht0 128(%r11) // software prefetch
+ prefetcht0 192(%r11) // software prefetch
+ prefetcht0 256(%r11) // software prefetch
+ prefetcht0 320(%r11) // software prefetch
+
+ jl 0f // clean-up loop
+
+ movq %r11, %r14
+ addq %r12, %r14 // A+bs*sda
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+
+ vmovupd 0(%r13), %ymm12
+ addq $32, %r13 // x+4
+
+ vmovapd 0(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ prefetcht0 64(%r14) // software prefetch
+
+ vmovapd 64(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ prefetcht0 128(%r14) // software prefetch
+
+ vmovapd 128(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vmovapd 160(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ prefetcht0 192(%r14) // software prefetch
+
+ vmovapd 192(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vmovapd 224(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ prefetcht0 256(%r14) // software prefetch
+
+ vmovapd 256(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm8, %ymm15, %ymm8
+
+ vmovapd 288(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm9, %ymm15, %ymm9
+
+ prefetcht0 320(%r14) // software prefetch
+
+ vmovapd 320(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm10, %ymm15, %ymm10
+
+ vmovapd 352(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm11, %ymm15, %ymm11
+
+// addq %r12, %r11 // A+bs*sda
+ movq %r14, %r11 // A+bs*sda
+ addq %r12, %r14 // A+bs*sda+bs*sda
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
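+	// build a lane mask from the leftover count: ymm14 = {0.5,1.5,2.5,3.5} - r10d,
+	// so lanes with index < r10d come out negative (sign bit set) and
+	// vmaskmovpd loads only the valid entries of x, zeroing the rest so the
+	// full-width column loads below contribute nothing in those lanes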
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmovapd 0(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 128(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vmovapd 160(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ vmovapd 192(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vmovapd 224(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ vmovapd 256(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm8, %ymm15, %ymm8
+
+ vmovapd 288(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm9, %ymm15, %ymm9
+
+ vmovapd 320(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm10, %ymm15, %ymm10
+
+ vmovapd 352(%r11), %ymm13
+ vmulpd %ymm13, %ymm12, %ymm15
+ vaddpd %ymm11, %ymm15, %ymm11
+
+ sall $3, %r10d
+// movslq %r10d, %r10
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_t_12_lib4, .-inner_kernel_dgemv_add_t_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z8 z9 za zb]_a
+// ymm3 <- [z0 z1 z2 z3]_b
+// ymm4 <- [z4 z5 z6 z7]_b
+// ymm5 <- [z8 z9 za zb]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
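+//
+// note: z[0:12] = alpha*(acc_a + acc_b) + beta*y[0:12]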
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_AB_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_ab_12_lib4, @function
+inner_blend_n_scale_ab_12_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_ab_12_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_12_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm1, %ymm4, %ymm1
+ vaddpd %ymm2, %ymm5, %ymm2
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+ vmovupd 64(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm2, %ymm14, %ymm2
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_ab_12_lib4, .-inner_blend_n_scale_ab_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- [z8a z8b z8c z8d]
+// ymm9 <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
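+//
+// note: reduces the four partial sums held in each of ymm0-11 using vhaddpd
+// and vperm2f128, then applies z[0:12] = alpha*z + beta*y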
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_12_lib4, @function
+inner_blend_t_scale_ab_12_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_12_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_12_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm9, %ymm8, %ymm8
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vhaddpd %ymm11, %ymm10, %ymm10
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x2, %ymm8, %ymm10, %ymm9
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vperm2f128 $0x13, %ymm8, %ymm10, %ymm8
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+ vaddpd %ymm8, %ymm9, %ymm2
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+ vmovupd 64(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm2, %ymm14, %ymm2
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_12_lib4, .-inner_blend_t_scale_ab_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==n
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z8 z9 za zb]_a
+// ymm3 <- [z0 z1 z2 z3]_b
+// ymm4 <- [z4 z5 z6 z7]_b
+// ymm5 <- [z8 z9 za zb]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_N_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_n_12_lib4, @function
+inner_blender_n_12_lib4:
+#elif defined(OS_MAC)
+_inner_blender_n_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_n_12_lib4; .scl 2; .type 32; .endef
+inner_blender_n_12_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm1, %ymm4, %ymm1
+ vaddpd %ymm2, %ymm5, %ymm2
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovupd 64(%r11), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+ vmovupd 64(%r11), %ymm15
+ vsubpd %ymm2, %ymm15, %ymm2
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_n_12_lib4, .-inner_blender_n_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==t
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- [z8a z8b z8c z8d]
+// ymm9 <- [z9a z9b z9c z9d]
+// ymm10 <- [zaa zab zac zad]
+// ymm11 <- [zba zbb zbc zbd]
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_T_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_t_12_lib4, @function
+inner_blender_t_12_lib4:
+#elif defined(OS_MAC)
+_inner_blender_t_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_t_12_lib4; .scl 2; .type 32; .endef
+inner_blender_t_12_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm9, %ymm8, %ymm8
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vhaddpd %ymm11, %ymm10, %ymm10
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x2, %ymm8, %ymm10, %ymm9
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vperm2f128 $0x13, %ymm8, %ymm10, %ymm8
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+ vaddpd %ymm8, %ymm9, %ymm2
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovupd 64(%r11), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+ vmovupd 64(%r11), %ymm15
+ vsubpd %ymm2, %ymm15, %ymm2
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_t_12_lib4, .-inner_blender_t_12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- [z8 z9 za zb]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_12_lib4, @function
+inner_store_12_lib4:
+#elif defined(OS_MAC)
+_inner_store_12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_12_lib4; .scl 2; .type 32; .endef
+inner_store_12_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %ymm1, 32(%r10)
+ vmovupd %ymm2, 64(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_12_lib4, .-inner_store_12_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_n_12_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
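+//
+// computes z[0:12] = alpha*A*x + beta*y for a 12-row block of A packed in
+// three 4-row panels; illustrative call (editor's sketch, argument names are
+// placeholders): kernel_dgemv_n_12_lib4(k, &alpha, pA, sda, x, &beta, y, z);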
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_12_lib4
+ .type kernel_dgemv_n_12_lib4, @function
+kernel_dgemv_n_12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_12_lib4
+_kernel_dgemv_n_12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_12_lib4
+ .def kernel_dgemv_n_12_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_12_lib4
+#endif
+#endif
+
+
+ // call inner blender n
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_12_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_12_lib4, .-kernel_dgemv_n_12_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_t_12_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_12_lib4
+ .type kernel_dgemv_t_12_lib4, @function
+kernel_dgemv_t_12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_12_lib4
+_kernel_dgemv_t_12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_12_lib4
+ .def kernel_dgemv_t_12_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_12_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_12_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_12_lib4, .-kernel_dgemv_t_12_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemv_4_lib4.S b/kernel/avx/kernel_dgemv_4_lib4.S
new file mode 100644
index 0000000..656e220
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_4_lib4.S
@@ -0,0 +1,4503 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- x
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- x+k*sizeof(double)
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
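+//
+// note: z[0:4] += A[0:4,0:k] * x[0:k], accumulating successive columns
+// round-robin into ymm0-3; the four accumulators are summed when blending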
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_n_4_lib4, @function
+inner_kernel_dgemv_add_n_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_n_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r11
+ addq $32, %r12
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ addq $32, %r11
+ addq $8, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+
+ jg 0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_n_4_lib4, .-inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x+k*sizeof(double)
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
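+//
+// note: partial dot products A(:,j)^T * x for 4 columns, one ymm accumulator
+// per column; the masked clean-up below handles the k%4 leftover rows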
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_t_4_lib4, @function
+inner_kernel_dgemv_add_t_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_t_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq %r12, %r11
+ addq $32, %r13
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmaskmovpd 0(%r11), %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmaskmovpd 32(%r11), %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmaskmovpd 64(%r11), %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmaskmovpd 96(%r11), %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ sall $3, %r10d
+// movslq %r10d, %r10
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_t_4_lib4, .-inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
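+// For reference, a minimal C sketch of this fused kernel, assuming the lib4
+// storage with panel stride sda; ref_* is only illustrative. In one pass over
+// a 4-column stripe it accumulates the transposed product into ymm0..ymm3
+// (reduced later) and updates z_n in memory, with the four x_n values held
+// broadcast in ymm6..ymm9:
+//
+//	static void ref_dgemv_add_nt_4(int k, const double *A, int sda, const double *x_t, const double *x_n, double *z_n, double z_t[4])
+//		{
+//		int i, j;
+//		for(i=0; i<k; i++)
+//			for(j=0; j<4; j++)
+//				{
+//				double a = A[(i/4)*4*sda + 4*j + i%4];
+//				z_t[j] += a * x_t[i];
+//				z_n[i] += a * x_n[j];
+//				}
+//		}
+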
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_nt_4_lib4, @function
+inner_kernel_dgemv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_nt_4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovapd 0(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 64(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 96(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm14, %ymm9, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm11
+
+ vmaskmovpd 0(%r13), %ymm11, %ymm12
+ vmaskmovpd 0(%r14), %ymm11, %ymm13
+
+// vmovupd %ymm14, -32(%rsp) // spill mask to stack
+
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 0(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 32(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 64(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 96(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm14, %ymm9, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+//	vmovupd	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd %ymm13, %ymm11, 0(%r14)
+
+ sall $3, %r10d // *sizeof(double)
+ addq %r10, %r11
+ addq %r10, %r13
+ addq %r10, %r14
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_nt_4_lib4, .-inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <-
+// r11 <-
+// r12 <-
+// r13 <-
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
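+// Sketch of the offA handling below (a hedged reading of the code, assuming
+// the .LC02 constant holds the per-lane values {0.5, 1.5, 2.5, 3.5} defined
+// elsewhere in this file): A and x are rewound to the start of the current
+// panel, a sign-bit mask is built that enables only the lanes
+// offA <= l < offA+kmax, one masked panel-column pass is performed, and the
+// pointers and counter are advanced so the main kernel continues from the
+// next panel with kmax - (4-offA) elements left. In C-like terms the mask is:
+//
+//	for(l=0; l<4; l++)
+//		mask[l] = (l>=offA && l<offA+kmax);	// sign bit set = lane enabled
+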
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_DGEMV_ADD_T_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemv_add_t_4_lib4, @function
+inner_edge_dgemv_add_t_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemv_add_t_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemv_add_t_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemv_add_t_4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jle 0f // return
+
+ movl %r14d, %r15d
+ sall $3, %r15d // offA*sizeof(double)
+
+ subq %r15, %r11 // A - offA
+ subq %r15, %r13 // x - offA
+
+ movl %r10d, %r15d // kmax
+ addl %r14d, %r15d // kmax + offA
+
+ vcvtsi2sd %r14d, %xmm14, %xmm14 // offA
+ vcvtsi2sd %r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm13, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
+ vandpd %ymm15, %ymm14, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $32, %r13 // x + 4
+ addq %r12, %r11 // A + bs*sda
+
+ addl %r14d, %r10d
+ subl $4, %r10d // kmax - (4-offA)
+
+0: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemv_add_t_4_lib4, .-inner_edge_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10 <- kmax
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- kmax-4
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
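+// For reference, a minimal C sketch of this diagonal-block edge routine for
+// the symmetric (lower-stored) 4x4 block, with A[4*j+i] denoting element
+// (i,j) in the lib4 block layout. The vblendpd-with-zero masks ensure each
+// strictly-lower entry is used once for the A^T ("t") accumulation and once
+// for the A ("n") update, while the diagonal contributes only to the "t"
+// accumulators here:
+//
+//	for(j=0; j<4; j++)
+//		{
+//		z_t[j] += A[4*j+j] * x_t[j];		// diagonal
+//		for(i=j+1; i<4; i++)
+//			{
+//			double a = A[4*j+i];		// strictly lower entry
+//			z_t[j] += a * x_t[i];
+//			z_n[i] += a * x_n[j];
+//			}
+//		}
+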
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dsymv_add_nt_4_lib4, @function
+inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dsymv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_lib4:
+#endif
+#endif
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovupd 0(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd 32(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd 64(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd 96(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+// vxorpd %ymm15, %ymm15, %ymm15
+// vblendpd $0x0, %ymm14, %ymm15, %ymm14
+// vmulpd %ymm14, %ymm9, %ymm15
+// vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ subq $4, %r10
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dsymv_add_nt_4_lib4, .-inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10 <- kmax
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// r15 <- offA
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- kmax-4
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// r15 <- offA
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_DSYMV_ADD_NT_4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dsymv_add_nt_4_gen_lib4, @function
+inner_edge_dsymv_add_nt_4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dsymv_add_nt_4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_gen_lib4:
+#endif
+#endif
+
+ movl $4, %eax
+ cmpl %eax, %r10d
+ jge 0f
+ movl %r10d, %eax
+0:
+ subl %r15d, %eax
+
+ vcvtsi2sd %eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm11
+
+ vmaskmovpd 0(%r13), %ymm11, %ymm12
+ vmaskmovpd 0(%r14), %ymm11, %ymm13
+
+ vmaskmovpd 0(%r11), %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmaskmovpd 32(%r11), %ymm11, %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmaskmovpd 64(%r11), %ymm11, %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmaskmovpd 96(%r11), %ymm11, %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+// vxorpd %ymm15, %ymm15, %ymm15
+// vblendpd $0x0, %ymm14, %ymm15, %ymm14
+// vmulpd %ymm14, %ymm9, %ymm15
+// vaddpd %ymm13, %ymm15, %ymm13
+
+ vmaskmovpd %ymm13, %ymm11, 0(%r14)
+
+ subl %eax, %r10d
+
+ salq $3, %rax // *sizeof(double)
+ addq %rax, %r11
+ subq $32, %r11
+ addq %r12, %r11
+ addq %rax, %r13
+ addq %rax, %r14
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dsymv_add_nt_4_gen_lib4, .-inner_edge_dsymv_add_nt_4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n
+//
+// input arguments:
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_lib4, @function
+inner_blend_n_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_lib4; .scl 2; .type 32; .endef
+inner_blend_n_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm1, %ymm0
+ vaddpd %ymm2, %ymm3, %ymm2
+ vaddpd %ymm0, %ymm2, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_lib4, .-inner_blend_n_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t
+//
+// input arguments:
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
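+// For reference: each of ymm0..ymm3 holds 4 per-lane partial sums of one
+// output element, so the reduction below is equivalent to this C sketch
+// (acc[j][l] mirrors lane l of ymm_j; names are only illustrative):
+//
+//	for(j=0; j<4; j++)
+//		{
+//		z[j] = 0.0;
+//		for(l=0; l<4; l++)
+//			z[j] += acc[j][l];
+//		}
+//
+// The vhaddpd/vperm2f128 sequence performs this 4x4 transpose-and-add
+// entirely in registers.
+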
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_lib4, @function
+inner_blend_t_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_lib4; .scl 2; .type 32; .endef
+inner_blend_t_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_lib4, .-inner_blend_t_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
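+// For reference, a minimal C sketch of the reduce-and-scale below (names are
+// only illustrative; acc0..acc3 mirror ymm0..ymm3, alpha and beta are the
+// pointed-to scalars):
+//
+//	for(i=0; i<4; i++)
+//		z[i] = alpha[0] * (acc0[i] + acc1[i] + acc2[i] + acc3[i]) + beta[0] * y[i];
+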
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_ab_4_lib4, @function
+inner_blend_n_scale_ab_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_ab_4_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm1, %ymm0
+ vaddpd %ymm2, %ymm3, %ymm2
+ vaddpd %ymm0, %ymm2, %ymm0
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_ab_4_lib4, .-inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_m11_4_lib4, @function
+inner_blend_n_scale_m11_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_m11_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_m11_4_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_m11_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm1, %ymm0
+ vaddpd %ymm2, %ymm3, %ymm2
+ vaddpd %ymm0, %ymm2, %ymm0
+
+ // beta
+ vmovupd 0(%r10), %ymm14
+ vsubpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_m11_4_lib4, .-inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_4_lib4, @function
+inner_blend_t_scale_ab_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_4_lib4, .-inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_a1_4_lib4, @function
+inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_a1_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+
+ // beta
+ vmovupd 0(%r11), %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_a1_4_lib4, .-inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_m11_4_lib4, @function
+inner_blend_t_scale_m11_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_m11_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+ vmovupd 0(%r10), %ymm14
+ vsubpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_m11_4_lib4, .-inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
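+// For reference, a minimal C sketch of the forward substitution below on the
+// 4x4 lower-triangular block E (lib4 layout, E[4*j+i] = element (i,j)), with
+// the diagonal divisions replaced by multiplications with the precomputed
+// reciprocals inv_diag_E:
+//
+//	for(j=0; j<4; j++)
+//		{
+//		z[j] *= inv_diag_E[j];
+//		for(i=j+1; i<4; i++)
+//			z[i] -= E[4*j+i] * z[j];
+//		}
+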
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LN_INV_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_ln_inv_4_lib4, @function
+inner_edge_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_ln_inv_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_ln_inv_4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+ vmovapd 0(%r10), %ymm13
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+
+ vmovapd 32(%r10), %ymm13
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x3, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x4, %ymm1, %ymm0, %ymm0
+
+ vmovapd 64(%r10), %ymm13
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_ln_inv_4_lib4, .-inner_edge_dtrsv_ln_inv_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS, variable size version
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LN_INV_4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_ln_inv_4_vs_lib4, @function
+inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_ln_inv_4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_ln_inv_4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+ vmovapd 0(%r10), %ymm13
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ cmpl $2, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+ vmovapd 32(%r10), %ymm13
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x3, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ cmpl $3, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x4, %ymm1, %ymm0, %ymm0
+ vmovapd 64(%r10), %ymm13
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm13, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+ cmpl $4, %r12d
+ jl 0f // ret
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm1
+ vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+ // return
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_ln_inv_4_vs_lib4, .-inner_edge_dtrsv_ln_inv_4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
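+// For reference, a minimal C sketch of the backward substitution below, which
+// solves with the transpose of the 4x4 lower-triangular block E (lib4 layout,
+// E[4*j+i] = element (i,j)), again using the precomputed reciprocals; the
+// initial shuffles gather the needed rows of E^T into registers:
+//
+//	for(j=3; j>=0; j--)
+//		{
+//		z[j] *= inv_diag_E[j];
+//		for(i=0; i<j; i++)
+//			z[i] -= E[4*i+j] * z[j];	// E^T(i,j) = E(j,i)
+//		}
+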
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LT_INV_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_lt_inv_4_lib4, @function
+inner_edge_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_lt_inv_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 16(%r10), %xmm12
+ vmovapd 48(%r10), %xmm13
+ vunpcklpd %xmm13, %xmm12, %xmm9
+ vblendpd $0xc, %ymm14, %ymm9, %ymm9
+ vunpckhpd %xmm13, %xmm12, %xmm10
+ vmovsd 8(%r10), %xmm8
+ vblendpd $0xe, %ymm14, %ymm8, %ymm8
+ vmovsd 88(%r10), %xmm11
+ vinsertf128 $0x1, %xmm11, %ymm10, %ymm10
+ vblendpd $0x8, %ymm14, %ymm10, %ymm10
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0xf, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x4, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x3, %ymm0, %ymm12
+// vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vbroadcastsd 8(%r11), %ymm12
+ vmovsd 0(%r11), %xmm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_lt_inv_4_lib4, .-inner_edge_dtrsv_lt_inv_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LT_INV_3_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_lt_inv_3_lib4, @function
+inner_edge_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_lt_inv_3_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_3_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 16(%r10), %xmm12
+ vmovapd 48(%r10), %xmm13
+ vunpcklpd %xmm13, %xmm12, %xmm9
+ vblendpd $0xc, %ymm14, %ymm9, %ymm9
+ vunpckhpd %xmm13, %xmm12, %xmm10
+ vmovsd 8(%r10), %xmm8
+ vblendpd $0xe, %ymm14, %ymm8, %ymm8
+ vmovsd 88(%r10), %xmm11
+ vinsertf128 $0x1, %xmm11, %ymm10, %ymm10
+ vblendpd $0x8, %ymm14, %ymm10, %ymm10
+
+// vbroadcastsd 24(%r11), %ymm12
+// vmulpd %ymm12, %ymm0, %ymm1
+// vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+ vmovupd 0(%r13), %ymm12
+ vblendpd $0x8, %ymm12, %ymm0, %ymm0
+
+ cmpl $4, %r12d
+ jl 0f
+
+ vpermilpd $0xf, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+0:
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x4, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x3, %ymm0, %ymm12
+// vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vbroadcastsd 8(%r11), %ymm12
+ vmovsd 0(%r11), %xmm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_lt_inv_3_lib4, .-inner_edge_dtrsv_lt_inv_3_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LT_INV_2_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_lt_inv_2_lib4, @function
+inner_edge_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_lt_inv_2_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_2_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ cmpl $3, %r12d
+
+ vmovapd 16(%r10), %xmm12
+ vmovapd 48(%r10), %xmm13
+ vunpcklpd %xmm13, %xmm12, %xmm9
+ vblendpd $0xc, %ymm14, %ymm9, %ymm9
+ vunpckhpd %xmm13, %xmm12, %xmm10
+ vmovsd 8(%r10), %xmm8
+ vblendpd $0xe, %ymm14, %ymm8, %ymm8
+// vmovsd 88(%r10), %xmm11
+// vinsertf128 $0x1, %xmm11, %ymm10, %ymm10
+// vblendpd $0x8, %ymm14, %ymm10, %ymm10
+ vblendpd $0xc, %ymm14, %ymm10, %ymm10
+
+// vbroadcastsd 24(%r11), %ymm12
+// vmulpd %ymm12, %ymm0, %ymm1
+// vblendpd $0x8, %ymm1, %ymm0, %ymm0
+
+ vmovupd 0(%r13), %ymm12
+ vblendpd $0xc, %ymm12, %ymm0, %ymm0
+
+ je 0f
+ jl 1f
+
+ vpermilpd $0xf, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+0:
+
+// vbroadcastsd 16(%r11), %ymm12
+// vmulpd %ymm12, %ymm0, %ymm1
+// vblendpd $0x4, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+1:
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x2, %ymm1, %ymm0, %ymm0
+
+ vpermilpd $0x3, %ymm0, %ymm12
+// vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vbroadcastsd 8(%r11), %ymm12
+
+ vmovsd 0(%r11), %xmm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_lt_inv_2_lib4, .-inner_edge_dtrsv_lt_inv_2_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- k
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSV_LT_INV_1_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsv_lt_inv_1_lib4, @function
+inner_edge_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsv_lt_inv_1_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsv_lt_inv_1_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovupd 0(%r13), %ymm12
+ vblendpd $0xe, %ymm12, %ymm0, %ymm0
+
+ cmpl $3, %r12d
+ je 0f
+
+ cmpl $2, %r12d
+ je 1f
+ jl 2f
+
+ vmovsd 24(%r10), %xmm10
+ vblendpd $0xe, %ymm14, %ymm10, %ymm10
+ vpermilpd $0xf, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm10, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+0:
+
+ vmovsd 16(%r10), %xmm9
+ vblendpd $0xe, %ymm14, %ymm9, %ymm9
+ vpermilpd $0x0, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm9, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+
+1:
+
+ vmovsd 8(%r10), %xmm8
+ vblendpd $0xe, %ymm14, %ymm8, %ymm8
+ vpermilpd $0x3, %ymm0, %ymm12
+// vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vsubpd %ymm15, %ymm0, %ymm0
+// vbroadcastsd 8(%r11), %ymm12
+
+2:
+
+ vmovsd 0(%r11), %xmm12
+ vmulpd %ymm12, %ymm0, %ymm1
+ vblendpd $0x1, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsv_lt_inv_1_lib4, .-inner_edge_dtrsv_lt_inv_1_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- x
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- k-4
+// r11 <- A+4*4*sizeof(double)
+// r12 <- x+4*sizeof(double)
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
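+// For reference, a minimal C sketch of this edge step: it multiplies by the
+// upper-triangular part of the first 4x4 block of A (lib4 layout), with the
+// vblendpd-with-zero masks keeping only rows 0..j of column j:
+//
+//	for(j=0; j<4; j++)
+//		for(i=0; i<=j; i++)
+//			z[i] += A[4*j+i] * x[j];
+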
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMV_UN_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmv_un_4_lib4, @function
+inner_edge_dtrmv_un_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmv_un_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmv_un_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmv_un_4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r11), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r11
+ addq $32, %r12
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmv_un_4_lib4, .-inner_edge_dtrmv_un_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x+k*sizeof(double)
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dtrmv_ut_4_lib4, @function
+inner_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dtrmv_ut_4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dtrmv_ut_4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jle 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq %r12, %r11
+ addq $32, %r13
+
+ cmpl $4, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+// vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+// vmovupd LC02(%rip), %ymm13
+#endif
+// vmovddup %xmm14, %xmm14
+// vinsertf128 $1, %xmm14, %ymm14, %ymm14
+// vsubpd %ymm14, %ymm13, %ymm14
+//
+// vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmovupd 0(%r13), %ymm12
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r11), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ sall $3, %r10d
+// movslq %r10d, %r10
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dtrmv_ut_4_lib4, .-inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_lib4, @function
+inner_store_4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_lib4; .scl 2; .type 32; .endef
+inner_store_4_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_lib4, .-inner_store_4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
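+// For reference: the mask built below (assuming .LC02 holds the per-lane
+// values {0.5, 1.5, 2.5, 3.5}) enables only the first km lanes, so the masked
+// store is equivalent to this C sketch, leaving z[km..3] untouched in memory:
+//
+//	for(l=0; l<4; l++)
+//		if(l < km)
+//			z[l] = result[l];
+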
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_vs_lib4, @function
+inner_store_4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_vs_lib4, .-inner_store_4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10 <- D
+// r11d <- k0 : start from (inclusive)
+// r12d <- k1 : up to (exclusive)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- k0 : start from (inclusive)
+// r12d <- k1 : up to (exclusive)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_gen_lib4, @function
+inner_store_4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r11d, %xmm14, %xmm14
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm12, %ymm15
+ vandpd %ymm14, %ymm15, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_gen_lib4, .-inner_store_4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dgemv_n_4_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z);
+
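+// This kernel computes z[0:4] = alpha*A*x + beta*y for one 4-row panel of A
+// in lib4 format. A hedged usage sketch (pointers and sizes are only
+// illustrative; in the library the call normally comes from the higher-level
+// dgemv routines):
+//
+//	double alpha = 1.0, beta = 1.0;
+//	kernel_dgemv_n_4_lib4(k, &alpha, pA, x, &beta, y, z);	// pA: start of a 4-row panel
+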
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_4_lib4
+ .type kernel_dgemv_n_4_lib4, @function
+kernel_dgemv_n_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_4_lib4
+_kernel_dgemv_n_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_4_lib4
+ .def kernel_dgemv_n_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_4_lib4, .-kernel_dgemv_n_4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dgemv_n_4_vs_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_4_vs_lib4
+ .type kernel_dgemv_n_4_vs_lib4, @function
+kernel_dgemv_n_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_4_vs_lib4
+_kernel_dgemv_n_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_4_vs_lib4
+ .def kernel_dgemv_n_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG8, %r11 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_4_vs_lib4, .-kernel_dgemv_n_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_dgemv_n_4_gen_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k0, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_4_gen_lib4
+ .type kernel_dgemv_n_4_gen_lib4, @function
+kernel_dgemv_n_4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_4_gen_lib4
+_kernel_dgemv_n_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_4_gen_lib4
+ .def kernel_dgemv_n_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG8, %r11 // k0
+ movq ARG9, %r12 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_4_gen_lib4, .-kernel_dgemv_n_4_gen_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dgemv_t_4_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+
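+// This kernel computes z[0:4] = alpha*A^T*x + beta*y for a 4-column stripe of
+// A stored in lib4 format with panel stride sda. A hedged usage sketch
+// (pointers and sizes are only illustrative):
+//
+//	double alpha = 1.0, beta = 0.0;
+//	kernel_dgemv_t_4_lib4(k, &alpha, pA, sda, x, &beta, y, z);	// pA: top of the 4-column stripe
+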
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_4_lib4
+ .type kernel_dgemv_t_4_lib4, @function
+kernel_dgemv_t_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_4_lib4
+_kernel_dgemv_t_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_4_lib4
+ .def kernel_dgemv_t_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_4_lib4, .-kernel_dgemv_t_4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_dgemv_t_4_vs_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_4_vs_lib4
+ .type kernel_dgemv_t_4_vs_lib4, @function
+kernel_dgemv_t_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_4_vs_lib4
+_kernel_dgemv_t_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_4_vs_lib4
+ .def kernel_dgemv_t_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+ movq ARG9, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_4_vs_lib4, .-kernel_dgemv_t_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemv_t_4_gen_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *y, double *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_4_gen_lib4
+ .type kernel_dgemv_t_4_gen_lib4, @function
+kernel_dgemv_t_4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_4_gen_lib4
+_kernel_dgemv_t_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_4_gen_lib4
+ .def kernel_dgemv_t_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG6, %r13 // x
+ movq ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemv_add_t_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG9, %r10 // z
+ movq ARG10, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_4_gen_lib4, .-kernel_dgemv_t_4_gen_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_dtrsv_ln_inv_4_lib4(int k, double *A, double *inv_diag_A, double *x, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_ln_inv_4_lib4
+ .type kernel_dtrsv_ln_inv_4_lib4, @function
+kernel_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_ln_inv_4_lib4
+_kernel_dtrsv_ln_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_ln_inv_4_lib4
+ .def kernel_dtrsv_ln_inv_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_ln_inv_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+ movq %r11, %r13 // A+k*sizeof(double)
+
+
+ // call inner blender n
+
+ movq ARG5, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq %r13, %r10 // A+k*sizeof(double)
+ movq ARG3, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LN_INV_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_ln_inv_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_ln_inv_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_ln_inv_4_lib4, .-kernel_dtrsv_ln_inv_4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dtrsv_ln_inv_4_vs_lib4(int k, double *A, double *inv_diag_A, double *x, double *y, double *z, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_ln_inv_4_vs_lib4
+ .type kernel_dtrsv_ln_inv_4_vs_lib4, @function
+kernel_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_ln_inv_4_vs_lib4
+_kernel_dtrsv_ln_inv_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_ln_inv_4_vs_lib4
+ .def kernel_dtrsv_ln_inv_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_ln_inv_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+	movq	%r11, %r13 // A+k*sizeof(double)
+
+
+ // call inner blender n
+
+ movq ARG5, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq %r13, %r10 // A+k*sizeof(double)
+ movq ARG3, %r11 // inv_diag_A
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LN_INV_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_ln_inv_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_ln_inv_4_vs_lib4
+#endif
+#endif
+
+
+ // store vs
+
+ movq ARG6, %r10 // z
+ movq ARG7, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_ln_inv_4_vs_lib4, .-kernel_dtrsv_ln_inv_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dtrsv_lt_inv_4_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
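+// (illustrative reference only) the lt variant appears to solve with the
+// transpose of the lower-triangular block: rows 4..k-1 are folded in by the
+// transposed dgemv inner routine (hence the k-4 / A+4*sda / x+4 setup below),
+// then a backward substitution runs over the 4x4 block:
+//
+//	// for (i = 3; i >= 0; i--) {
+//	//	double t = y[i];
+//	//	// minus the contribution of rows 4..k-1, i.e. sum over r of A(r,i)*x[r],
+//	//	// which is what the transposed inner kernel accumulates;
+//	//	for (j = i+1; j < 4; j++) t -= A[j + 4*i] * z[j];
+//	//	z[i] = t * inv_diag_A[i];
+//	// }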
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_lt_inv_4_lib4
+ .type kernel_dtrsv_lt_inv_4_lib4, @function
+kernel_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_lt_inv_4_lib4
+_kernel_dtrsv_lt_inv_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_lt_inv_4_lib4
+ .def kernel_dtrsv_lt_inv_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ addq %r12, %r11 // A+4*sda*sizeof(double)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+4
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LT_INV_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_lt_inv_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_lt_inv_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_lt_inv_4_lib4, .-kernel_dtrsv_lt_inv_4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsv_lt_inv_3_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_lt_inv_3_lib4
+ .type kernel_dtrsv_lt_inv_3_lib4, @function
+kernel_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_lt_inv_3_lib4
+_kernel_dtrsv_lt_inv_3_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_lt_inv_3_lib4
+ .def kernel_dtrsv_lt_inv_3_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_3_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ addq %r12, %r11 // A+4*sda*sizeof(double)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+4
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG1, %r12 // k
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LT_INV_3_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_lt_inv_3_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_lt_inv_3_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq $3, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_lt_inv_3_lib4, .-kernel_dtrsv_lt_inv_3_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsv_lt_inv_2_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_lt_inv_2_lib4
+ .type kernel_dtrsv_lt_inv_2_lib4, @function
+kernel_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_lt_inv_2_lib4
+_kernel_dtrsv_lt_inv_2_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_lt_inv_2_lib4
+ .def kernel_dtrsv_lt_inv_2_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_2_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movslq %r12d, %r12
+ addq %r12, %r11 // A+4*sda*sizeof(double)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+4
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG1, %r12 // k
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LT_INV_2_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_lt_inv_2_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_lt_inv_2_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq $2, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_lt_inv_2_lib4, .-kernel_dtrsv_lt_inv_2_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsv_lt_inv_1_lib4(int k, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsv_lt_inv_1_lib4
+ .type kernel_dtrsv_lt_inv_1_lib4, @function
+kernel_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsv_lt_inv_1_lib4
+_kernel_dtrsv_lt_inv_1_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsv_lt_inv_1_lib4
+ .def kernel_dtrsv_lt_inv_1_lib4; .scl 2; .type 32; .endef
+kernel_dtrsv_lt_inv_1_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movslq %r12d, %r12
+ addq %r12, %r11 // A+4*sda*sizeof(double)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+4
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_4_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_4_lib4
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG1, %r12 // k
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSV_LT_INV_1_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsv_lt_inv_1_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsv_lt_inv_1_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq $1, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsv_lt_inv_1_lib4, .-kernel_dtrsv_lt_inv_1_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_dtrmv_un_4_lib4(int k, double *A, double *x, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmv_un_4_lib4
+ .type kernel_dtrmv_un_4_lib4, @function
+kernel_dtrmv_un_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmv_un_4_lib4
+_kernel_dtrmv_un_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmv_un_4_lib4
+ .def kernel_dtrmv_un_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_un_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dtrmv edge & dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMV_UN_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmv_un_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmv_un_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG4, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmv_un_4_lib4, .-kernel_dtrmv_un_4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_dtrmv_ut_4_lib4(int k, double *A, int sda, double *x, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmv_ut_4_lib4
+ .type kernel_dtrmv_ut_4_lib4, @function
+kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmv_ut_4_lib4
+_kernel_dtrmv_ut_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmv_ut_4_lib4
+ .def kernel_dtrmv_ut_4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_ut_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dtrmv kernel ut
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movslq %r12d, %r12
+ movq ARG4, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dtrmv_ut_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+ // call inner blend t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmv_ut_4_lib4, .-kernel_dtrmv_ut_4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9
+// void kernel_dtrmv_ut_4_vs_lib4(int k, double *A, int sda, double *x, double *y, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmv_ut_4_vs_lib4
+ .type kernel_dtrmv_ut_4_vs_lib4, @function
+kernel_dtrmv_ut_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmv_ut_4_vs_lib4
+_kernel_dtrmv_ut_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmv_ut_4_vs_lib4
+ .def kernel_dtrmv_ut_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_ut_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+	// call inner dtrmv kernel ut
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movslq %r12d, %r12
+ movq ARG4, %r13 // x
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_DTRMV_UT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dtrmv_ut_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dtrmv_ut_4_lib4
+#endif
+#endif
+
+
+ // call inner blend t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // z
+ movq ARG6, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmv_ut_4_vs_lib4, .-kernel_dtrmv_ut_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dgemv_nt_4_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
+
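+// (illustrative reference only, assuming the lib4 layout where element (i,j) of
+// the k x 4 block sits at A[(i/4)*4*sda + 4*j + i%4]) the nt kernel fuses one
+// sweep over A with two products: z_n is updated in place inside the inner
+// kernel, z_t is accumulated in ymm0-3 and stored at the end, roughly:
+//
+//	// for (i = 0; i < k; i++)
+//	//	for (j = 0; j < 4; j++) {
+//	//		z_n[i]   += alpha_n[0] * A(i,j) * x_n[j];
+//	//		acc_t[j] +=              A(i,j) * x_t[i];
+//	//	}
+//	// for (j = 0; j < 4; j++) z_t[j] = beta_t[0]*y_t[j] + alpha_t[0]*acc_t[j];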
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_nt_4_lib4
+ .type kernel_dgemv_nt_4_lib4, @function
+kernel_dgemv_nt_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_nt_4_lib4
+_kernel_dgemv_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_nt_4_lib4
+ .def kernel_dgemv_nt_4_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+
+
+ // inner kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_nt_4_lib4, .-kernel_dgemv_nt_4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_dgemv_nt_4_vs_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_nt_4_vs_lib4
+ .type kernel_dgemv_nt_4_vs_lib4, @function
+kernel_dgemv_nt_4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_nt_4_vs_lib4
+_kernel_dgemv_nt_4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_nt_4_vs_lib4
+ .def kernel_dgemv_nt_4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
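+	// also zero the (alpha-scaled) x_n registers, so that entries beyond km
+	// contribute nothing inside the inner nt kernel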
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+ movq ARG12, %r11 // km
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ cmpl $2, %r11d
+ jl 0f
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ cmpl $3, %r11d
+ jl 0f
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ je 0f
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+0:
+
+ // inner kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// inner blend t scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+ movq ARG12, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_nt_4_vs_lib4, .-kernel_dgemv_nt_4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x, double *z);
+
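+// (illustrative reference only) dsymv for the lower triangle appears to reuse
+// the nt machinery above: the 4x4 diagonal block goes through the dsymv edge
+// routine and the panel below it is read once, contributing both in the n and
+// in the t direction, so only the lower part of A is touched; with A(i,j) the
+// panel-major element of the k x 4 slice, roughly:
+//
+//	// for (i = 0; i < k; i++)
+//	//	for (j = 0; j <= (i < 4 ? i : 3); j++) {   // stored (lower) entries only
+//	//		z[i] += alpha[0] * A(i,j) * x[j];
+//	//		if (i != j) z[j] += alpha[0] * A(i,j) * x[i];
+//	//	}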
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsymv_l_4_lib4
+ .type kernel_dsymv_l_4_lib4, @function
+kernel_dsymv_l_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsymv_l_4_lib4
+_kernel_dsymv_l_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsymv_l_4_lib4
+ .def kernel_dsymv_l_4_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG5, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+
+
+	// inner edge dsymv & kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // x_t
+ movq ARG6, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dsymv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsymv_l_4_lib4, .-kernel_dsymv_l_4_lib4
+#endif
+
+
+
+
+
+//  1      2              3        4          5        6          7          8
+// void kernel_dsymv_l_4_gen_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsymv_l_4_gen_lib4
+ .type kernel_dsymv_l_4_gen_lib4, @function
+kernel_dsymv_l_4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsymv_l_4_gen_lib4
+_kernel_dsymv_l_4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsymv_l_4_gen_lib4
+ .def kernel_dsymv_l_4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+
+
+	// inner edge dsymv & kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // x_t
+ movq ARG7, %r14 // z_n
+ movq ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_DSYMV_ADD_NT_4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dsymv_add_nt_4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dsymv_add_nt_4_gen_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z_t
+ movq ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsymv_l_4_gen_lib4, .-kernel_dsymv_l_4_gen_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
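+// note: each pair of .long entries in .LC02/.LC03/.LC04 encodes one IEEE-754
+// double, low word first; e.g. { .long 0, .long 1071644672 } is
+// 0x3FE0000000000000 = 0.5. The 0.5..3.5 and 4.5..7.5 ramps appear to be
+// compared against a remaining-element count to build sign-bit masks for
+// vmaskmovpd in the partial-width (vs/gen) paths.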
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgemv_8_lib4.S b/kernel/avx/kernel_dgemv_8_lib4.S
new file mode 100644
index 0000000..53d371e
--- /dev/null
+++ b/kernel/avx/kernel_dgemv_8_lib4.S
@@ -0,0 +1,1575 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
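+// The ARG*/PROLOGUE/EPILOGUE macros above abstract the two supported calling
+// conventions: System V (Linux/Mac) passes the first six integer/pointer
+// arguments in rdi, rsi, rdx, rcx, r8, r9, while Windows x64 passes only four
+// in rcx, rdx, r8, r9 and additionally requires rdi, rsi and xmm6-xmm15 to be
+// preserved, hence the larger stack frame and the extra vmovups spills in the
+// Windows prologue. On System V, for example, ARG7 resolves to the first
+// on-stack argument once the 64-byte register spill area is accounted for.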
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x+k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
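+// (illustrative reference only, assuming the lib4 panel-major layout: 4-row
+// panels with a stride of 4*sda doubles, A1 = A0 + one panel) a rough C model
+// of what this routine accumulates:
+//
+//	// for (j = 0; j < k; j++) {
+//	//	for (i = 0; i < 4; i++) acc_lo[i] += A0[4*j + i] * x[j];  // rows 0-3
+//	//	for (i = 0; i < 4; i++) acc_hi[i] += A1[4*j + i] * x[j];  // rows 4-7
+//	// }
+//	// acc_lo is split over the ymm0/ymm2 pair and acc_hi over ymm1/ymm3; the
+//	// blend step later reduces the pairs into [z0..z3] and [z4..z7].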
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_n_8_lib4, @function
+inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_n_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ cmpl $4, %r10d
+
+ prefetcht0 0(%r11) // software prefetch
+ prefetcht0 0(%r15) // software prefetch
+ prefetcht0 64(%r11) // software prefetch
+ prefetcht0 64(%r15) // software prefetch
+
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 128(%r11) // software prefetch
+ prefetcht0 128(%r15) // software prefetch
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ subl $4, %r10d
+
+ vbroadcastsd 8(%r13), %ymm12
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 32(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ prefetcht0 192(%r11) // software prefetch
+ prefetcht0 192(%r15) // software prefetch
+
+ vbroadcastsd 16(%r13), %ymm12
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vbroadcastsd 24(%r13), %ymm12
+ addq $32, %r13 // x+4
+ vmovapd 96(%r11), %ymm8
+ addq $128, %r11 // A0+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r15), %ymm8
+ addq $128, %r15 // A1+4*bs
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ addq $32, %r11
+ addq $32, %r15
+ addq $8, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+
+ jg 0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_n_8_lib4, .-inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x+k*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
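+// (illustrative reference only, same layout assumptions as above) each of the
+// eight accumulators ymm0-7 collects the partial dot product of one panel
+// column with x:
+//
+//	// for (ii = 0; ii < k; ii += 4)                 // one 4-row panel per step
+//	//	for (j = 0; j < 8; j++)
+//	//		for (i = 0; i < 4 && ii+i < k; i++)
+//	//			acc[j][i] += A[ii*sda + 4*j + i] * x[ii+i];
+//	// the horizontal reductions over i are left to the blend_t step.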
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_t_8_lib4, @function
+inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_t_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+
+ prefetcht0 0(%r11) // software prefetch
+ prefetcht0 64(%r11) // software prefetch
+ prefetcht0 128(%r11) // software prefetch
+ prefetcht0 192(%r11) // software prefetch
+
+ jl 0f // clean-up loop
+
+ movq %r11, %r14
+ addq %r12, %r14 // A+bs*sda
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+
+ vmovupd 0(%r13), %ymm12
+ addq $32, %r13 // x+4
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ prefetcht0 64(%r14) // software prefetch
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ prefetcht0 128(%r14) // software prefetch
+
+ vmovapd 128(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vmovapd 160(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ prefetcht0 192(%r14) // software prefetch
+
+ vmovapd 192(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vmovapd 224(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+// addq %r12, %r11 // A+bs*sda
+ movq %r14, %r11 // A+bs*sda
+ addq %r12, %r14 // A+bs*sda+bs*sda
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
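+	// note: the sequence above builds a per-lane load mask from the remaining
+	// count in r10d: .LC02 = { 0.5 1.5 2.5 3.5 } (memory order) minus
+	// (double)r10d is negative exactly in the first r10d lanes, so its sign
+	// bits select which x entries vmaskmovpd reads; masked-out lanes are zeroed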
+
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 128(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+
+ vmovapd 160(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+
+ vmovapd 192(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+
+ vmovapd 224(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+ sall $3, %r10d
+// movslq %r10d, %r10
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_t_8_lib4, .-inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- k-4
+// r11 <- A+4*4*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x+4*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
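+// (illustrative note) the edge below handles the first 8 columns, where column
+// j of the upper-triangular A has only its first j+1 entries valid; the
+// vblendpd $0x1/$0x3/$0x7 with a zeroed register splice zeros into the
+// below-diagonal entries of each loaded 4-element column, so the full 4-wide
+// multiply-add can still be used: a_col[i] = (i <= j) ? A(i,j) : 0.0
+// before acc += a_col * x[j].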
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmv_un_8_lib4, @function
+inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmv_un_8_lib4:
+#endif
+#endif
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ // first 4 columns
+ vmovapd 0(%r11), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 8(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ vmovapd 64(%r11), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 16(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r13), %ymm12
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+
+ addq $128, %r11
+ addq $128, %r15
+ addq $32, %r13
+
+
+
+ // last 4 columns
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ subl $4, %r10d
+
+ vbroadcastsd 8(%r13), %ymm12
+ vmovapd 32(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 32(%r15), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vbroadcastsd 16(%r13), %ymm12
+ vmovapd 64(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 64(%r15), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ vbroadcastsd 24(%r13), %ymm12
+ vmovapd 96(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ addq $128, %r11
+ addq $128, %r15
+ addq $32, %r13
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmv_un_8_lib4, .-inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n
+//
+// input arguments:
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_8_lib4, @function
+inner_blend_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_8_lib4, .-inner_blend_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t
+//
+// input arguments:
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_8_lib4, @function
+inner_blend_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_8_lib4, .-inner_blend_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_ab_8_lib4, @function
+inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_ab_8_lib4, .-inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_8_lib4, @function
+inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm1, %ymm14, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_8_lib4, .-inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==n
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_n_8_lib4, @function
+inner_blender_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_n_8_lib4; .scl 2; .type 32; .endef
+inner_blender_n_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_n_8_lib4, .-inner_blender_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==t
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_t_8_lib4, @function
+inner_blender_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_t_8_lib4; .scl 2; .type 32; .endef
+inner_blender_t_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_t_8_lib4, .-inner_blender_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8_lib4, @function
+inner_store_8_lib4:
+#elif defined(OS_MAC)
+_inner_store_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8_lib4; .scl 2; .type 32; .endef
+inner_store_8_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %ymm1, 32(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8_lib4, .-inner_store_8_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_n_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+
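+// (hypothetical call site, for illustration only; assumes A points at the first
+// of two consecutive 4-row panels of a panel-major (lib4) matrix and that sda
+// is the panel stride expressed in units of 4 doubles)
+//
+//	// double alpha = 1.0, beta = 0.0;
+//	// kernel_dgemv_n_8_lib4(k, &alpha, A, sda, x, &beta, y, z);
+//	// // z[0:8] = beta*y[0:8] + alpha * A[0:8,0:k] * x[0:k]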
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_8_lib4
+ .type kernel_dgemv_n_8_lib4, @function
+kernel_dgemv_n_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_8_lib4
+_kernel_dgemv_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_8_lib4
+ .def kernel_dgemv_n_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_8_lib4, .-kernel_dgemv_n_8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_t_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_8_lib4
+ .type kernel_dgemv_t_8_lib4, @function
+kernel_dgemv_t_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_8_lib4
+_kernel_dgemv_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_8_lib4
+ .def kernel_dgemv_t_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_8_lib4, .-kernel_dgemv_t_8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_dtrmv_un_8_lib4(int k, double *A, int sda, double *x, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmv_un_8_lib4
+ .type kernel_dtrmv_un_8_lib4, @function
+kernel_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmv_un_8_lib4
+_kernel_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmv_un_8_lib4
+ .def kernel_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_un_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dtrmv edge & dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG4, %r13 // x
+
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmv_un_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+ // call inner blender n
+
+#if MACRO_LEVEL>=1
+ INNER_BLENDER_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_8_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmv_un_8_lib4, .-kernel_dtrmv_un_8_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
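+
+	// LC00/LC01 hold 64-bit integer lane patterns (mask-style constants);
+	// LC02-LC04 encode the doubles named in each label comment as pairs of
+	// 32-bit words, low word first: e.g. .long 0 / .long 1071644672 is
+	// 0x3FE0000000000000 = 0.5, and .long 0 / .long 1072693248 is
+	// 0x3FF0000000000000 = 1.0.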
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_dgeqrf_4_lib4.c b/kernel/avx/kernel_dgeqrf_4_lib4.c
new file mode 100644
index 0000000..a5faf20
--- /dev/null
+++ b/kernel/avx/kernel_dgeqrf_4_lib4.c
@@ -0,0 +1,2751 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+#include "../../include/blasfeo_d_kernel.h"
+
+
+
+void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w1, w2, w3;
+ const int ps = 4;
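+	// Panel-major indexing used throughout: element (i,j) lives at
+	// pD[i%ps + (i/ps)*ps*sdd + j*ps], so row panels of height ps=4 are
+	// ps*sdd doubles apart.  For each of the 4 columns the code forms a
+	// Householder reflector H = I - tau*v*v^T (LAPACK dlarfg-style:
+	// tau = (beta-alpha)/beta, v scaled by 1.0/(alpha-beta), tau stored in
+	// dD), then applies it to the trailing columns via the dot products
+	// w1..w3 (gemv_t) followed by the rank-1 update (ger).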
+ // first column
+ beta = 0.0;
+ ii = 1;
+ if(m>1)
+ {
+ tmp = pD[1+ps*0];
+ beta += tmp*tmp;
+ if(m>2)
+ {
+ tmp = pD[2+ps*0];
+ beta += tmp*tmp;
+ if(m>3)
+ {
+ tmp = pD[3+ps*0];
+ beta += tmp*tmp;
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[0] = 0.0;
+ }
+ else
+ {
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau0
+ dD[0] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pD[0+ps*0] = beta;
+ ii = 1;
+ if(m>1)
+ {
+ pD[1+ps*0] *= tmp;
+ if(m>2)
+ {
+ pD[2+ps*0] *= tmp;
+ if(m>3)
+ {
+ pD[3+ps*0] *= tmp;
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*0] *= tmp;
+ pD[1+ii*sdd+ps*0] *= tmp;
+ pD[2+ii*sdd+ps*0] *= tmp;
+ pD[3+ii*sdd+ps*0] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*0] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w1 = pD[0+ps*1];
+ w2 = pD[0+ps*2];
+ w3 = pD[0+ps*3];
+ if(m>1)
+ {
+ w1 += pD[1+ps*1] * pD[1+ps*0];
+ w2 += pD[1+ps*2] * pD[1+ps*0];
+ w3 += pD[1+ps*3] * pD[1+ps*0];
+ if(m>2)
+ {
+ w1 += pD[2+ps*1] * pD[2+ps*0];
+ w2 += pD[2+ps*2] * pD[2+ps*0];
+ w3 += pD[2+ps*3] * pD[2+ps*0];
+ if(m>3)
+ {
+ w1 += pD[3+ps*1] * pD[3+ps*0];
+ w2 += pD[3+ps*2] * pD[3+ps*0];
+ w3 += pD[3+ps*3] * pD[3+ps*0];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w1 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ w1 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ w1 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ w1 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w1 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ }
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ pD[0+ps*1] += w1;
+ pD[0+ps*2] += w2;
+ pD[0+ps*3] += w3;
+ if(m>1)
+ {
+ pD[1+ps*1] += w1 * pD[1+ps*0];
+ pD[1+ps*2] += w2 * pD[1+ps*0];
+ pD[1+ps*3] += w3 * pD[1+ps*0];
+ if(m>2)
+ {
+ pD[2+ps*1] += w1 * pD[2+ps*0];
+ pD[2+ps*2] += w2 * pD[2+ps*0];
+ pD[2+ps*3] += w3 * pD[2+ps*0];
+ if(m>3)
+ {
+ pD[3+ps*1] += w1 * pD[3+ps*0];
+ pD[3+ps*2] += w2 * pD[3+ps*0];
+ pD[3+ps*3] += w3 * pD[3+ps*0];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*1] += w1 * pD[0+ii*sdd+ps*0];
+ pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*0];
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*1] += w1 * pD[1+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*1] += w1 * pD[2+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*1] += w1 * pD[3+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*0];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*1] += w1 * pD[ll+ii*sdd+ps*0];
+ pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*0];
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*0];
+ }
+ if(m==1)
+ return;
+ // second column
+ beta = 0.0;
+ if(m>2)
+ {
+ tmp = pD[2+ps*1];
+ beta += tmp*tmp;
+ if(m>3)
+ {
+ tmp = pD[3+ps*1];
+ beta += tmp*tmp;
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[1] = 0.0;
+ }
+ else
+ {
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau1
+		dD[1] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v1
+ pD[1+ps*1] = beta;
+ if(m>2)
+ {
+ pD[2+ps*1] *= tmp;
+ if(m>3)
+ {
+ pD[3+ps*1] *= tmp;
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*1] *= tmp;
+ pD[1+ii*sdd+ps*1] *= tmp;
+ pD[2+ii*sdd+ps*1] *= tmp;
+ pD[3+ii*sdd+ps*1] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*1] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w2 = pD[1+ps*2];
+ w3 = pD[1+ps*3];
+ if(m>2)
+ {
+ w2 += pD[2+ps*2] * pD[2+ps*1];
+ w3 += pD[2+ps*3] * pD[2+ps*1];
+ if(m>3)
+ {
+ w2 += pD[3+ps*2] * pD[3+ps*1];
+ w3 += pD[3+ps*3] * pD[3+ps*1];
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ }
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ pD[1+ps*2] += w2;
+ pD[1+ps*3] += w3;
+ if(m>2)
+ {
+ pD[2+ps*2] += w2 * pD[2+ps*1];
+ pD[2+ps*3] += w3 * pD[2+ps*1];
+ if(m>3)
+ {
+ pD[3+ps*2] += w2 * pD[3+ps*1];
+ pD[3+ps*3] += w3 * pD[3+ps*1];
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*1];
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*1];
+ pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*1];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*1];
+ pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*1];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*1];
+ pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*1];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*1];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*1];
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*1];
+ }
+ if(m==2)
+ return;
+ // third column
+ beta = 0.0;
+ if(m>3)
+ {
+ tmp = pD[3+ps*2];
+ beta += tmp*tmp;
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[2] = 0.0;
+ }
+ else
+ {
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau2
+		dD[2] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v2
+ pD[2+ps*2] = beta;
+ if(m>3)
+ {
+ pD[3+ps*2] *= tmp;
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*2] *= tmp;
+ pD[1+ii*sdd+ps*2] *= tmp;
+ pD[2+ii*sdd+ps*2] *= tmp;
+ pD[3+ii*sdd+ps*2] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*2] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w3 = pD[2+ps*3];
+ if(m>3)
+ {
+ w3 += pD[3+ps*3] * pD[3+ps*2];
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ w3 = - dD[2] * w3;
+ pD[2+ps*3] += w3;
+ if(m>3)
+ {
+ pD[3+ps*3] += w3 * pD[3+ps*2];
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*2];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*2];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*2];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*2];
+ }
+ if(m==3)
+ return;
+ // fourth column
+ beta = 0.0;
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[3] = 0.0;
+ }
+ else
+ {
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau3
+		dD[3] = (beta-alpha) / beta;
+		tmp = 1.0 / (alpha-beta);
+		// compute v3
+ pD[3+ps*3] = beta;
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*3] *= tmp;
+ pD[1+ii*sdd+ps*3] *= tmp;
+ pD[2+ii*sdd+ps*3] *= tmp;
+ pD[3+ii*sdd+ps*3] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*3] *= tmp;
+ }
+ }
+ return;
+ }
+
+
+// unblocked algorithm
+void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+ const int ps = 4;
+ imax = k;//m<n ? m : n;
+ double alpha, beta, tmp, w0;
+ double *pC00, *pC10, *pC01, *pC11;
+ int offset;
+ double *pD0 = pD-offD;
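+	// pC00 below addresses the diagonal element D(ii,ii) of the offD-shifted
+	// panel-major matrix: (offD+ii)&(ps-1) is the offset inside the 4-row
+	// panel and ((offD+ii)-((offD+ii)&(ps-1)))*sdd selects the panel; jmax0
+	// counts the rows from the sub-diagonal element to the next panel
+	// boundary, handled one-by-one before the loops step by whole panels.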
+ for(ii=0; ii<imax; ii++)
+ {
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ jmax = m-ii-1;
+ jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ offset = 0;
+ jj = 0;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ tmp = pC10[1+offset];
+ beta += tmp*tmp;
+ tmp = pC10[2+offset];
+ beta += tmp*tmp;
+ tmp = pC10[3+offset];
+ beta += tmp*tmp;
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ offset += 1;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ offset = 0;
+ jj = 0;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ pC10[0+offset] *= tmp;
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ pC10[0+offset] *= tmp;
+ pC10[1+offset] *= tmp;
+ pC10[2+offset] *= tmp;
+ pC10[3+offset] *= tmp;
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ pC10[0+offset] *= tmp;
+ offset += 1;
+ }
+ pC00[0] = beta;
+ }
+ if(ii<n)
+ {
+ pC01 = pC00 + ps;
+ pC11 = pC10 + ps;
+ kmax = jmax;
+ kmax0 = jmax0;
+ jmax = n-ii-1;
+ jj = 0;
+ for( ; jj<jmax; jj++)
+ {
+ w0 = pC01[0+ps*jj] * 1.0;
+ offset = 0;
+ kk = 0;
+ if(kmax0>0)
+ {
+ for( ; kk<kmax0; kk++)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; kk<kmax-3; kk+=4)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ w0 += pC11[1+offset+ps*jj] * pC10[1+offset];
+ w0 += pC11[2+offset+ps*jj] * pC10[2+offset];
+ w0 += pC11[3+offset+ps*jj] * pC10[3+offset];
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<kmax-kk; ll++)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ offset += 1;
+ }
+ w0 = - dD[ii] * w0;
+ pC01[0+ps*jj] += w0;
+ offset = 0;
+ kk = 0;
+ if(kmax0>0)
+ {
+ for( ; kk<kmax0; kk++)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ offset += 1;
+ }
+ offset = offset-ps+ps*sdd;
+ }
+ for( ; kk<kmax-3; kk+=4)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ pC11[1+offset+ps*jj] += w0 * pC10[1+offset];
+ pC11[2+offset+ps*jj] += w0 * pC10[2+offset];
+ pC11[3+offset+ps*jj] += w0 * pC10[3+offset];
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<kmax-kk; ll++)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ offset += 1;
+ }
+ }
+ }
+ }
+ return;
+ }
+
+
+
+void kernel_dlarf_4_lib4(int m, int n, double *pD, int sdd, double *dD, double *pC0, int sdc)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, ll;
+ const int ps = 4;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ double tmp, d0, d1, d2, d3;
+ double *pC;
+ double pT[16];// = {};
+ int ldt = 4;
+ double pW[8];// = {};
+ int ldw = 2;
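+	// pT (4x4, lower triangular) accumulates the compact-WY factor T built
+	// from the taus in dD and the dot products v10..v32 of the Householder
+	// vectors; pW (2x4, ldw=2) holds C^T*V for two columns of C at a time,
+	// matching the "W^T" used in the comments below.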
+ // dot product of v
+ v10 = 0.0;
+ v20 = 0.0;
+ v30 = 0.0;
+ v21 = 0.0;
+ v31 = 0.0;
+ v32 = 0.0;
+ if(m>1)
+ {
+ v10 = 1.0 * pD[1+ps*0];
+ if(m>2)
+ {
+ v10 += pD[2+ps*1] * pD[2+ps*0];
+ v20 = 1.0 * pD[2+ps*0];
+ v21 = 1.0 * pD[2+ps*1];
+ if(m>3)
+ {
+ v10 += pD[3+ps*1] * pD[3+ps*0];
+ v20 += pD[3+ps*2] * pD[3+ps*0];
+ v21 += pD[3+ps*2] * pD[3+ps*1];
+ v30 = 1.0 * pD[3+ps*0];
+ v31 = 1.0 * pD[3+ps*1];
+ v32 = 1.0 * pD[3+ps*2];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ // compute lower triangular T containing tau for matrix update
+ pT[0+ldt*0] = dD[0];
+ pT[1+ldt*1] = dD[1];
+ pT[2+ldt*2] = dD[2];
+ pT[3+ldt*3] = dD[3];
+ pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+ pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+ pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+ pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+ pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+ pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+ // downgrade matrix
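+	// Two columns of C are updated per pass: W^T = C^T*V is formed, then
+	// multiplied by T ("W^T *= T"), then C -= V*W^T; overall this applies
+	// C := (I - V*T*V^T)*C with V unit lower trapezoidal.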
+ pW[0] = 0.0;
+ pW[1] = 0.0;
+ pW[2] = 0.0;
+ pW[3] = 0.0;
+ pW[4] = 0.0;
+ pW[5] = 0.0;
+ pW[6] = 0.0;
+ pW[7] = 0.0;
+ ii = 0;
+ for( ; ii<n-1; ii+=2)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ tmp = pC[0+ps*1];
+ pW[1+ldw*0] = tmp;
+ if(m>1)
+ {
+ d0 = pD[1+ps*0];
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] = tmp;
+ tmp = pC[1+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] = tmp;
+ if(m>2)
+ {
+ d0 = pD[2+ps*0];
+ d1 = pD[2+ps*1];
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] = tmp;
+ tmp = pC[2+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] = tmp;
+ if(m>3)
+ {
+ d0 = pD[3+ps*0];
+ d1 = pD[3+ps*1];
+ d2 = pD[3+ps*2];
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] = tmp;
+ tmp = pC[3+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pD[0+jj*sdd+ps*0];
+ d1 = pD[0+jj*sdd+ps*1];
+ d2 = pD[0+jj*sdd+ps*2];
+ d3 = pD[0+jj*sdd+ps*3];
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[0+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[1+jj*sdd+ps*0];
+ d1 = pD[1+jj*sdd+ps*1];
+ d2 = pD[1+jj*sdd+ps*2];
+ d3 = pD[1+jj*sdd+ps*3];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[1+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[2+jj*sdd+ps*0];
+ d1 = pD[2+jj*sdd+ps*1];
+ d2 = pD[2+jj*sdd+ps*2];
+ d3 = pD[2+jj*sdd+ps*3];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[2+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[3+jj*sdd+ps*0];
+ d1 = pD[3+jj*sdd+ps*1];
+ d2 = pD[3+jj*sdd+ps*2];
+ d3 = pD[3+jj*sdd+ps*3];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[3+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pD[ll+jj*sdd+ps*0];
+ d1 = pD[ll+jj*sdd+ps*1];
+ d2 = pD[ll+jj*sdd+ps*2];
+ d3 = pD[ll+jj*sdd+ps*3];
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[ll+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ }
+ // compute W^T *= T
+ pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+ pW[1+ldw*3] = pT[3+ldt*0]*pW[1+ldw*0] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[1+ldw*2] + pT[3+ldt*3]*pW[1+ldw*3];
+ pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+ pW[1+ldw*2] = pT[2+ldt*0]*pW[1+ldw*0] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[1+ldw*2];
+ pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+ pW[1+ldw*1] = pT[1+ldt*0]*pW[1+ldw*0] + pT[1+ldt*1]*pW[1+ldw*1];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ pW[1+ldw*0] = pT[0+ldt*0]*pW[1+ldw*0];
+ // compute C -= V * W^T
+ pC[0+ps*0] -= pW[0+ldw*0];
+ pC[0+ps*1] -= pW[1+ldw*0];
+ if(m>1)
+ {
+ pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+ pC[1+ps*1] -= pD[1+ps*0]*pW[1+ldw*0] + pW[1+ldw*1];
+ if(m>2)
+ {
+ pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+ pC[2+ps*1] -= pD[2+ps*0]*pW[1+ldw*0] + pD[2+ps*1]*pW[1+ldw*1] + pW[1+ldw*2];
+ if(m>3)
+ {
+ pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+ pC[3+ps*1] -= pD[3+ps*0]*pW[1+ldw*0] + pD[3+ps*1]*pW[1+ldw*1] + pD[3+ps*2]*pW[1+ldw*2] + pW[1+ldw*3];
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pD[0+jj*sdd+ps*0];
+ d1 = pD[0+jj*sdd+ps*1];
+ d2 = pD[0+jj*sdd+ps*2];
+ d3 = pD[0+jj*sdd+ps*3];
+ pC[0+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[0+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[1+jj*sdd+ps*0];
+ d1 = pD[1+jj*sdd+ps*1];
+ d2 = pD[1+jj*sdd+ps*2];
+ d3 = pD[1+jj*sdd+ps*3];
+ pC[1+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[1+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[2+jj*sdd+ps*0];
+ d1 = pD[2+jj*sdd+ps*1];
+ d2 = pD[2+jj*sdd+ps*2];
+ d3 = pD[2+jj*sdd+ps*3];
+ pC[2+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[2+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[3+jj*sdd+ps*0];
+ d1 = pD[3+jj*sdd+ps*1];
+ d2 = pD[3+jj*sdd+ps*2];
+ d3 = pD[3+jj*sdd+ps*3];
+ pC[3+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[3+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pD[ll+jj*sdd+ps*0];
+ d1 = pD[ll+jj*sdd+ps*1];
+ d2 = pD[ll+jj*sdd+ps*2];
+ d3 = pD[ll+jj*sdd+ps*3];
+ pC[ll+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[ll+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ }
+ }
+ for( ; ii<n; ii++)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ if(m>1)
+ {
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += tmp * pD[1+ps*0];
+ pW[0+ldw*1] = tmp;
+ if(m>2)
+ {
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += tmp * pD[2+ps*0];
+ pW[0+ldw*1] += tmp * pD[2+ps*1];
+ pW[0+ldw*2] = tmp;
+ if(m>3)
+ {
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += tmp * pD[3+ps*0];
+ pW[0+ldw*1] += tmp * pD[3+ps*1];
+ pW[0+ldw*2] += tmp * pD[3+ps*2];
+ pW[0+ldw*3] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[0+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[0+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[0+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[0+jj*sdd+ps*3];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[1+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[1+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[1+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[1+jj*sdd+ps*3];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[2+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[2+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[2+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[2+jj*sdd+ps*3];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[3+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[3+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[3+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[3+jj*sdd+ps*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[ll+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[ll+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[ll+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[ll+jj*sdd+ps*3];
+ }
+ // compute W^T *= T
+ pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+ pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+ pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ // compute C -= V * W^T
+ pC[0+ps*0] -= pW[0+ldw*0];
+ if(m>1)
+ {
+ pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+ if(m>2)
+ {
+ pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+ if(m>3)
+ {
+ pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ pC[0+jj*sdc+ps*0] -= pD[0+jj*sdd+ps*0]*pW[0+ldw*0] + pD[0+jj*sdd+ps*1]*pW[0+ldw*1] + pD[0+jj*sdd+ps*2]*pW[0+ldw*2] + pD[0+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[1+jj*sdc+ps*0] -= pD[1+jj*sdd+ps*0]*pW[0+ldw*0] + pD[1+jj*sdd+ps*1]*pW[0+ldw*1] + pD[1+jj*sdd+ps*2]*pW[0+ldw*2] + pD[1+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[2+jj*sdc+ps*0] -= pD[2+jj*sdd+ps*0]*pW[0+ldw*0] + pD[2+jj*sdd+ps*1]*pW[0+ldw*1] + pD[2+jj*sdd+ps*2]*pW[0+ldw*2] + pD[2+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[3+jj*sdc+ps*0] -= pD[3+jj*sdd+ps*0]*pW[0+ldw*0] + pD[3+jj*sdd+ps*1]*pW[0+ldw*1] + pD[3+jj*sdd+ps*2]*pW[0+ldw*2] + pD[3+jj*sdd+ps*3]*pW[0+ldw*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ pC[ll+jj*sdc+ps*0] -= pD[ll+jj*sdd+ps*0]*pW[0+ldw*0] + pD[ll+jj*sdd+ps*1]*pW[0+ldw*1] + pD[ll+jj*sdd+ps*2]*pW[0+ldw*2] + pD[ll+jj*sdd+ps*3]*pW[0+ldw*3];
+ }
+ }
+
+ return;
+ }
+
+
+
+void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc, double *pW0)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, ll;
+ const int ps = 4;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ double c00, c01,
+ c10, c11,
+ c20, c21,
+ c30, c31;
+ double a0, a1, a2, a3, b0, b1;
+ double tmp, d0, d1, d2, d3;
+ double *pC, *pW;
+ double pT[16];// = {};
+ int ldt = 4;
+ // dot product of v
+ v10 = 0.0;
+ v20 = 0.0;
+ v30 = 0.0;
+ v21 = 0.0;
+ v31 = 0.0;
+ v32 = 0.0;
+ if(m>1)
+ {
+ v10 = 1.0 * pD[1+ps*0];
+ if(m>2)
+ {
+ v10 += pD[2+ps*1] * pD[2+ps*0];
+ v20 = 1.0 * pD[2+ps*0];
+ v21 = 1.0 * pD[2+ps*1];
+ if(m>3)
+ {
+ v10 += pD[3+ps*1] * pD[3+ps*0];
+ v20 += pD[3+ps*2] * pD[3+ps*0];
+ v21 += pD[3+ps*2] * pD[3+ps*1];
+ v30 = 1.0 * pD[3+ps*0];
+ v31 = 1.0 * pD[3+ps*1];
+ v32 = 1.0 * pD[3+ps*2];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ // compute lower triangular T containing tau for matrix update
+ pT[0+ldt*0] = dD[0];
+ pT[1+ldt*1] = dD[1];
+ pT[2+ldt*2] = dD[2];
+ pT[3+ldt*3] = dD[3];
+ pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+ pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+ pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+ pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+ pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+ pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+ // downgrade matrix
+ __m256d
+ _w0, _w1, _w2, _w3, _d0, _t0, _tp, _c0, _c1, _c2, _c3, _a0, _b0, _tz;
+
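+	// The update is split into three passes over the workspace pW0:
+	// (1) W = V^T*C, formed with the dgemm_nn kernels (pVt holds V already
+	//     transposed) or with the intrinsics/scalar fallback under "#else";
+	// (2) W = T*W, done column-block by column-block below;
+	// (3) C -= V*W: the top 4-row panel is updated in scalar code, the
+	//     remaining panels via the dger4_sub kernels (or the intrinsics
+	//     fallback under the trailing "#else").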
+ ii = 0;
+#if 1
+ double alpha = 1.0;
+ double beta = 0.0;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for( ; ii<n-11; ii+=12)
+ {
+ kernel_dgemm_nn_4x12_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+ }
+#endif
+ for( ; ii<n-7; ii+=8)
+ {
+ kernel_dgemm_nn_4x8_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+ }
+ for( ; ii<n-3; ii+=4)
+ {
+ kernel_dgemm_nn_4x4_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii]);
+ }
+ if(ii<n)
+ {
+// kernel_dgemm_nn_4x4_vs_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, &pW0[0+ps*ii], &pW0[0+ps*ii], 4, n-ii);
+ kernel_dgemm_nn_4x4_gen_lib4(m, &alpha, &pVt[0+ps*0], 0, &pC0[0+ps*ii], sdc, &beta, 0, &pW0[0+ps*ii], 0, 0, &pW0[0+ps*ii], 0, 0, 4, 0, n-ii);
+ }
+#else
+ for( ; ii<n-3; ii+=4)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ _w0 = _mm256_setzero_pd();
+ _w1 = _mm256_setzero_pd();
+ _w2 = _mm256_setzero_pd();
+ _w3 = _mm256_setzero_pd();
+ for(jj=0; jj<m-3; jj+=4)
+ {
+ //
+ _d0 = _mm256_load_pd( &pVt[0+ps*(0+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[0+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ //
+ _d0 = _mm256_load_pd( &pVt[0+ps*(1+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[1+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ //
+ _d0 = _mm256_load_pd( &pVt[0+ps*(2+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[2+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ //
+ _d0 = _mm256_load_pd( &pVt[0+ps*(3+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[3+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ _d0 = _mm256_load_pd( &pVt[0+ps*(ll+jj)] );
+ _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*0] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*1] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*2] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _t0 = _mm256_broadcast_sd( &pC[ll+jj*sdc+ps*3] );
+ _tp = _mm256_mul_pd( _d0, _t0 );
+ _w3 = _mm256_add_pd( _w3, _tp );
+ }
+ // TODO mask store
+ _mm256_storeu_pd( &pW[0+ps*0], _w0 );
+ _mm256_storeu_pd( &pW[0+ps*1], _w1 );
+ _mm256_storeu_pd( &pW[0+ps*2], _w2 );
+ _mm256_storeu_pd( &pW[0+ps*3], _w3 );
+ }
+ for( ; ii<n; ii++)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ps*0] = tmp;
+ if(m>1)
+ {
+ d0 = pVt[0+ps*1];
+ tmp = pC[1+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] = tmp;
+ if(m>2)
+ {
+ d0 = pVt[0+ps*2];
+ d1 = pVt[1+ps*2];
+ tmp = pC[2+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] = tmp;
+ if(m>3)
+ {
+ d0 = pVt[0+ps*3];
+ d1 = pVt[1+ps*3];
+ d2 = pVt[2+ps*3];
+ tmp = pC[3+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pVt[0+ps*(0+jj)];
+ d1 = pVt[1+ps*(0+jj)];
+ d2 = pVt[2+ps*(0+jj)];
+ d3 = pVt[3+ps*(0+jj)];
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(1+jj)];
+ d1 = pVt[1+ps*(1+jj)];
+ d2 = pVt[2+ps*(1+jj)];
+ d3 = pVt[3+ps*(1+jj)];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(2+jj)];
+ d1 = pVt[1+ps*(2+jj)];
+ d2 = pVt[2+ps*(2+jj)];
+ d3 = pVt[3+ps*(2+jj)];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(3+jj)];
+ d1 = pVt[1+ps*(3+jj)];
+ d2 = pVt[2+ps*(3+jj)];
+ d3 = pVt[3+ps*(3+jj)];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pVt[0+ps*(ll+jj)];
+ d1 = pVt[1+ps*(ll+jj)];
+ d2 = pVt[2+ps*(ll+jj)];
+ d3 = pVt[3+ps*(ll+jj)];
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ps*0] += d0 * tmp;
+ pW[1+ps*0] += d1 * tmp;
+ pW[2+ps*0] += d2 * tmp;
+ pW[3+ps*0] += d3 * tmp;
+ }
+ }
+#endif
+
+ ii = 0;
+ for( ; ii<n-3; ii+=4)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+
+ // compute W^T *= T
+ _tz = _mm256_setzero_pd();
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*0] );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*0] );
+ _w0 = _mm256_mul_pd( _t0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*1] );
+ _w1 = _mm256_mul_pd( _t0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*2] );
+ _w2 = _mm256_mul_pd( _t0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*3] );
+ _w3 = _mm256_mul_pd( _t0, _tp );
+
+#if defined(TARGET_X64_INTEL_HASWELL)
+ _t0 = _mm256_load_pd( &pT[0+ldt*1] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+ _w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*1] );
+ _w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*2] );
+ _w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*3] );
+ _w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*2] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+ _w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*1] );
+ _w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*2] );
+ _w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*3] );
+ _w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*3] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+ _w0 = _mm256_fmadd_pd( _t0, _tp, _w0 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*1] );
+ _w1 = _mm256_fmadd_pd( _t0, _tp, _w1 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*2] );
+ _w2 = _mm256_fmadd_pd( _t0, _tp, _w2 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*3] );
+ _w3 = _mm256_fmadd_pd( _t0, _tp, _w3 );
+#else
+ _t0 = _mm256_load_pd( &pT[0+ldt*1] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*1] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*2] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*3] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w3 = _mm256_add_pd( _w3, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*2] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*1] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*2] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*3] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w3 = _mm256_add_pd( _w3, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*3] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*1] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w1 = _mm256_add_pd( _w1, _tp );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*2] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w2 = _mm256_add_pd( _w2, _tp );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*3] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w3 = _mm256_add_pd( _w3, _tp );
+#endif
+
+ _mm256_store_pd( &pW[0+ps*0], _w0 );
+ _mm256_store_pd( &pW[0+ps*1], _w1 );
+ _mm256_store_pd( &pW[0+ps*2], _w2 );
+ _mm256_store_pd( &pW[0+ps*3], _w3 );
+ }
+ for( ; ii<n; ii++)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+
+ // compute W^T *= T
+ _tz = _mm256_setzero_pd();
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*0] );
+ _tp = _mm256_broadcast_sd( &pW[0+ps*0] );
+ _w0 = _mm256_mul_pd( _t0, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*1] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x1 );
+ _tp = _mm256_broadcast_sd( &pW[1+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*2] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x3 );
+ _tp = _mm256_broadcast_sd( &pW[2+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+
+ _t0 = _mm256_load_pd( &pT[0+ldt*3] );
+ _t0 = _mm256_blend_pd( _t0, _tz, 0x7 );
+ _tp = _mm256_broadcast_sd( &pW[3+ps*0] );
+ _tp = _mm256_mul_pd( _t0, _tp );
+ _w0 = _mm256_add_pd( _w0, _tp );
+
+ _mm256_store_pd( &pW[0+ps*0], _w0 );
+ }
+
+ ii = 0;
+ for( ; ii<n-3; ii+=4)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ // compute C -= V * W^T
+ jj = 0;
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ c01 = pC[0+jj*sdc+ps*1];
+ c11 = pC[1+jj*sdc+ps*1];
+ c21 = pC[2+jj*sdc+ps*1];
+ c31 = pC[3+jj*sdc+ps*1];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[0+ps*1];
+ c01 -= b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[1+ps*1];
+ c11 -= b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c20 -= b0;
+ c30 -= a3*b0;
+ b1 = pW[2+ps*1];
+ c21 -= b1;
+ c31 -= a3*b1;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c30 -= b0;
+ b1 = pW[3+ps*1];
+ c31 -= b1;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[0+jj*sdc+ps*1] = c01;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[1+jj*sdc+ps*1] = c11;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[2+jj*sdc+ps*1] = c21;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*0] = c30;
+ pC[3+jj*sdc+ps*1] = c31;
+ }
+ }
+ }
+ // load
+ c00 = pC[0+jj*sdc+ps*2];
+ c10 = pC[1+jj*sdc+ps*2];
+ c20 = pC[2+jj*sdc+ps*2];
+ c30 = pC[3+jj*sdc+ps*2];
+ c01 = pC[0+jj*sdc+ps*3];
+ c11 = pC[1+jj*sdc+ps*3];
+ c21 = pC[2+jj*sdc+ps*3];
+ c31 = pC[3+jj*sdc+ps*3];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ps*2];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[0+ps*3];
+ c01 -= b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ps*2];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[1+ps*3];
+ c11 -= b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ps*2];
+ c20 -= b0;
+ c30 -= a3*b0;
+ b1 = pW[2+ps*3];
+ c21 -= b1;
+ c31 -= a3*b1;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ps*2];
+ c30 -= b0;
+ b1 = pW[3+ps*3];
+ c31 -= b1;
+ // store
+ pC[0+jj*sdc+ps*2] = c00;
+ pC[0+jj*sdc+ps*3] = c01;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*2] = c10;
+ pC[1+jj*sdc+ps*3] = c11;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*2] = c20;
+ pC[2+jj*sdc+ps*3] = c21;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*2] = c30;
+ pC[3+jj*sdc+ps*3] = c31;
+ }
+ }
+ }
+ }
+ for( ; ii<n; ii++)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ // compute C -= V * W^T
+ jj = 0;
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c20 -= b0;
+ c30 -= a3*b0;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c30 -= b0;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*0] = c10;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*0] = c20;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*0] = c30;
+ }
+ }
+ }
+ }
+
+#if 1
+ jj = 4;
+#if defined(TARGET_X64_INTEL_HASWELL)
+ for(; jj<m-11; jj+=12)
+ {
+ kernel_dger4_sub_12r_lib4(n, &pD[jj*sdd], sdd, &pW0[0], &pC0[jj*sdc], sdc);
+ }
+#endif
+ for(; jj<m-7; jj+=8)
+ {
+ kernel_dger4_sub_8r_lib4(n, &pD[jj*sdd], sdd, &pW0[0], &pC0[jj*sdc], sdc);
+ }
+ for(; jj<m-3; jj+=4)
+ {
+ kernel_dger4_sub_4r_lib4(n, &pD[jj*sdd], &pW0[0], &pC0[jj*sdc]);
+ }
+ if(jj<m)
+ {
+ kernel_dger4_sub_4r_vs_lib4(n, &pD[jj*sdd], &pW0[0], &pC0[jj*sdc], m-jj);
+ }
+#else
+ ii = 0;
+ for( ; ii<n-3; ii+=4)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ // load
+ _c0 = _mm256_load_pd( &pC[0+jj*sdc+ps*0] );
+ _c1 = _mm256_load_pd( &pC[0+jj*sdc+ps*1] );
+ _c2 = _mm256_load_pd( &pC[0+jj*sdc+ps*2] );
+ _c3 = _mm256_load_pd( &pC[0+jj*sdc+ps*3] );
+ //
+ _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*0] );
+ _b0 = _mm256_broadcast_sd( &pW[0+ps*0] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c0 = _mm256_sub_pd( _c0, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[0+ps*1] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c1 = _mm256_sub_pd( _c1, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[0+ps*2] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c2 = _mm256_sub_pd( _c2, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[0+ps*3] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c3 = _mm256_sub_pd( _c3, _tp );
+ //
+ _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*1] );
+ _b0 = _mm256_broadcast_sd( &pW[1+ps*0] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c0 = _mm256_sub_pd( _c0, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[1+ps*1] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c1 = _mm256_sub_pd( _c1, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[1+ps*2] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c2 = _mm256_sub_pd( _c2, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[1+ps*3] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c3 = _mm256_sub_pd( _c3, _tp );
+ //
+ _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*2] );
+ _b0 = _mm256_broadcast_sd( &pW[2+ps*0] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c0 = _mm256_sub_pd( _c0, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[2+ps*1] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c1 = _mm256_sub_pd( _c1, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[2+ps*2] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c2 = _mm256_sub_pd( _c2, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[2+ps*3] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c3 = _mm256_sub_pd( _c3, _tp );
+ //
+ _a0 = _mm256_load_pd( &pD[0+jj*sdd+ps*3] );
+ _b0 = _mm256_broadcast_sd( &pW[3+ps*0] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c0 = _mm256_sub_pd( _c0, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[3+ps*1] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c1 = _mm256_sub_pd( _c1, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[3+ps*2] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c2 = _mm256_sub_pd( _c2, _tp );
+ _b0 = _mm256_broadcast_sd( &pW[3+ps*3] );
+ _tp = _mm256_mul_pd( _a0, _b0 );
+ _c3 = _mm256_sub_pd( _c3, _tp );
+ // store
+ _mm256_store_pd( &pC[0+jj*sdc+ps*0], _c0 );
+ _mm256_store_pd( &pC[0+jj*sdc+ps*1], _c1 );
+ _mm256_store_pd( &pC[0+jj*sdc+ps*2], _c2 );
+ _mm256_store_pd( &pC[0+jj*sdc+ps*3], _c3 );
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ // load
+ c00 = pC[ll+jj*sdc+ps*0];
+ c01 = pC[ll+jj*sdc+ps*1];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= a0*b0;
+ b1 = pW[0+ps*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c00 -= a0*b0;
+ b1 = pW[1+ps*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c00 -= a0*b0;
+ b1 = pW[2+ps*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c00 -= a0*b0;
+ b1 = pW[3+ps*1];
+ c01 -= a0*b1;
+ // store
+ pC[ll+jj*sdc+ps*0] = c00;
+ pC[ll+jj*sdc+ps*1] = c01;
+ // load
+ c00 = pC[ll+jj*sdc+ps*2];
+ c01 = pC[ll+jj*sdc+ps*3];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ps*2];
+ c00 -= a0*b0;
+ b1 = pW[0+ps*3];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ps*2];
+ c00 -= a0*b0;
+ b1 = pW[1+ps*3];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ps*2];
+ c00 -= a0*b0;
+ b1 = pW[2+ps*3];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ps*2];
+ c00 -= a0*b0;
+ b1 = pW[3+ps*3];
+ c01 -= a0*b1;
+ // store
+ pC[ll+jj*sdc+ps*2] = c00;
+ pC[ll+jj*sdc+ps*3] = c01;
+ }
+ }
+ for( ; ii<n; ii++)
+ {
+ pW = pW0+ii*ps;
+ pC = pC0+ii*ps;
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ //
+ a0 = pD[0+jj*sdd+ps*0];
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*1];
+ a1 = pD[1+jj*sdd+ps*1];
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*2];
+ a1 = pD[1+jj*sdd+ps*2];
+ a2 = pD[2+jj*sdd+ps*2];
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*3];
+ a1 = pD[1+jj*sdd+ps*3];
+ a2 = pD[2+jj*sdd+ps*3];
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[3+jj*sdc+ps*0] = c30;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ // load
+ c00 = pC[ll+jj*sdc+ps*0];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ps*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ps*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ps*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ps*0];
+ c00 -= a0*b0;
+ // store
+ pC[ll+jj*sdc+ps*0] = c00;
+ }
+ }
+#endif
+
+ return;
+ }
+
+
+
+// assume n>=4
+void kernel_dgelqf_4_lib4(int n, double *pD, double *dD)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w1, w2, w3;
+ const int ps = 4;
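+	// LQ analogue of kernel_dgeqrf_4_lib4: the reflectors act on the rows of
+	// the 4 x n panel (pD is a single panel, no sdd needed).  The same
+	// dlarfg-style formulas give tau and the scaled row vector; beta for the
+	// next row is accumulated on the fly while the trailing rows are updated.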
+ // first column
+ beta = 0.0;
+ for(ii=1; ii<n; ii++)
+ {
+ tmp = pD[0+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[0] = 0.0;
+ tmp = 0.0;
+ goto col2;
+ }
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[0] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[0+ps*0] = beta;
+ w1 = pD[1+ps*0];
+ w2 = pD[2+ps*0];
+ w3 = pD[3+ps*0];
+ //
+ pD[0+ps*1] *= tmp;
+ w1 += pD[1+ps*1] * pD[0+ps*1];
+ w2 += pD[2+ps*1] * pD[0+ps*1];
+ w3 += pD[3+ps*1] * pD[0+ps*1];
+ //
+ pD[0+ps*2] *= tmp;
+ w1 += pD[1+ps*2] * pD[0+ps*2];
+ w2 += pD[2+ps*2] * pD[0+ps*2];
+ w3 += pD[3+ps*2] * pD[0+ps*2];
+ //
+ pD[0+ps*3] *= tmp;
+ w1 += pD[1+ps*3] * pD[0+ps*3];
+ w2 += pD[2+ps*3] * pD[0+ps*3];
+ w3 += pD[3+ps*3] * pD[0+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[0+ps*ii] *= tmp;
+ w1 += pD[1+ps*ii] * pD[0+ps*ii];
+ w2 += pD[2+ps*ii] * pD[0+ps*ii];
+ w3 += pD[3+ps*ii] * pD[0+ps*ii];
+ }
+ //
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ //
+ pD[1+ps*0] += w1;
+ pD[2+ps*0] += w2;
+ pD[3+ps*0] += w3;
+ //
+ pD[1+ps*1] += w1 * pD[0+ps*1];
+ pD[2+ps*1] += w2 * pD[0+ps*1];
+ pD[3+ps*1] += w3 * pD[0+ps*1];
+ //
+ pD[1+ps*2] += w1 * pD[0+ps*2];
+ pD[2+ps*2] += w2 * pD[0+ps*2];
+ pD[3+ps*2] += w3 * pD[0+ps*2];
+ beta = pD[1+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] += w1 * pD[0+ps*3];
+ pD[2+ps*3] += w2 * pD[0+ps*3];
+ pD[3+ps*3] += w3 * pD[0+ps*3];
+ beta += pD[1+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] += w1 * pD[0+ps*ii];
+ pD[2+ps*ii] += w2 * pD[0+ps*ii];
+ pD[3+ps*ii] += w3 * pD[0+ps*ii];
+ beta += pD[1+ps*ii] * pD[1+ps*ii];
+ }
+ // second column
+col2:
+ if(beta==0.0)
+ {
+ dD[1] = 0.0;
+ tmp = 0.0;
+ goto col3;
+ }
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[1] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[1+ps*1] = beta;
+ w2 = pD[2+ps*1];
+ w3 = pD[3+ps*1];
+ //
+ pD[1+ps*2] *= tmp;
+ w2 += pD[2+ps*2] * pD[1+ps*2];
+ w3 += pD[3+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] *= tmp;
+ w2 += pD[2+ps*3] * pD[1+ps*3];
+ w3 += pD[3+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] *= tmp;
+ w2 += pD[2+ps*ii] * pD[1+ps*ii];
+ w3 += pD[3+ps*ii] * pD[1+ps*ii];
+ }
+ //
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ //
+ pD[2+ps*1] += w2;
+ pD[3+ps*1] += w3;
+ //
+ pD[2+ps*2] += w2 * pD[1+ps*2];
+ pD[3+ps*2] += w3 * pD[1+ps*2];
+ //
+ pD[2+ps*3] += w2 * pD[1+ps*3];
+ pD[3+ps*3] += w3 * pD[1+ps*3];
+ beta = pD[2+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] += w2 * pD[1+ps*ii];
+ pD[3+ps*ii] += w3 * pD[1+ps*ii];
+ beta += pD[2+ps*ii] * pD[2+ps*ii];
+ }
+ // third column
+col3:
+ if(beta==0.0)
+ {
+ dD[2] = 0.0;
+ tmp = 0.0;
+ goto col4;
+ }
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[2] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[2+ps*2] = beta;
+ w3 = pD[3+ps*2];
+ //
+ pD[2+ps*3] *= tmp;
+ w3 += pD[3+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] *= tmp;
+ w3 += pD[3+ps*ii] * pD[2+ps*ii];
+ }
+ //
+ w3 = - dD[2] * w3;
+ //
+ pD[3+ps*2] += w3;
+ //
+ pD[3+ps*3] += w3 * pD[2+ps*3];
+ //
+ beta = 0.0;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] += w3 * pD[2+ps*ii];
+ beta += pD[3+ps*ii] * pD[3+ps*ii];
+ }
+ // fourth column
+col4:
+ if(beta==0.0)
+ {
+ dD[3] = 0.0;
+ tmp = 0.0;
+ return;
+ }
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[3] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[3+ps*3] = beta;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] *= tmp;
+ }
+ return;
+ }
+
+
+
+// unblocked algorithm
+void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+ const int ps = 4;
+ imax = k;//m<n ? m : n;
+ double alpha, beta, tmp;
+ double w00, w01,
+ w10, w11,
+ w20, w21,
+ w30, w31;
+ __m256d
+ _a0, _b0, _t0, _w0, _w1;
+ double *pC00, *pC10, *pC10a, *pC20, *pC20a, *pC01, *pC11;
+ double pT[4];
+ int ldt = 2;
+ double *pD0 = pD-offD;
+ ii = 0;
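+	// Main loop: two Householder reflectors (rows ii and ii+1) are generated
+	// and applied together; their 2x2 coupling factor is held in pT (ldt=2),
+	// and the block update of the rows below uses AVX over full 4-row panels
+	// with scalar clean-up at the panel edges.  The loop after "#endif"
+	// handles a leftover single reflector.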
+#if 1 // rank 2
+ for(; ii<imax-1; ii+=2)
+ {
+ // first row
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ tmp = pC00[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC00[0] = beta;
+ for(jj=1; jj<n-ii; jj++)
+ pC00[0+ps*jj] *= tmp;
+ }
+ pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ kmax = n-ii;
+ w00 = pC10[0+ps*0]; // pC00[0+ps*0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = - w00*dD[ii];
+ pC10[0+ps*0] += w00; // pC00[0+ps*0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ // second row
+ pC11 = pC10+ps*1;
+ beta = 0.0;
+ for(jj=1; jj<n-(ii+1); jj++)
+ {
+ tmp = pC11[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[(ii+1)] = 0.0;
+ }
+ else
+ {
+ alpha = pC11[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[(ii+1)] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC11[0+ps*0] = beta;
+ for(jj=1; jj<n-(ii+1); jj++)
+ pC11[0+ps*jj] *= tmp;
+ }
+ // compute T
+ kmax = n-ii;
+ tmp = 1.0*0.0 + pC00[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ tmp += pC00[0+ps*kk]*pC10[0+ps*kk];
+ pT[0+ldt*0] = - dD[ii+0];
+ pT[0+ldt*1] = + dD[ii+1] * tmp * dD[ii+0];
+ pT[1+ldt*1] = - dD[ii+1];
+ // downgrade
+ kmax = n-ii;
+ jmax = m-ii-2;
+ jmax0 = (ps-((ii+2+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ jj = 0;
+ pC20a = &pD0[((offD+ii+2)&(ps-1))+((offD+ii+2)-((offD+ii+2)&(ps-1)))*sdd+ii*ps];
+ pC20 = pC20a;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+ w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+ w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+ }
+ w01 = w00*pT[0+ldt*1] + w01*pT[1+ldt*1];
+ w00 = w00*pT[0+ldt*0];
+ pC20[0+ps*0] += w00*1.0 + w01*0.0;
+ pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+ }
+ pC20 += 1;
+ }
+ pC20 += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ //
+ _w0 = _mm256_load_pd( &pC20[0+ps*0] );
+ _a0 = _mm256_load_pd( &pC20[0+ps*1] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*1] );
+ _t0 = _mm256_mul_pd( _a0, _b0 );
+ _w0 = _mm256_add_pd( _w0, _t0 );
+ _w1 = _mm256_load_pd( &pC20[0+ps*1] );
+ for(kk=2; kk<kmax; kk++)
+ {
+ _a0 = _mm256_load_pd( &pC20[0+ps*kk] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _a0, _b0 );
+ _w0 = _mm256_add_pd( _w0, _t0 );
+ _b0 = _mm256_broadcast_sd( &pC10[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _a0, _b0 );
+ _w1 = _mm256_add_pd( _w1, _t0 );
+ }
+ //
+ _b0 = _mm256_broadcast_sd( &pT[1+ldt*1] );
+ _w1 = _mm256_mul_pd( _w1, _b0 );
+ _b0 = _mm256_broadcast_sd( &pT[0+ldt*1] );
+ _t0 = _mm256_mul_pd( _w0, _b0 );
+ _w1 = _mm256_add_pd( _w1, _t0 );
+ _b0 = _mm256_broadcast_sd( &pT[0+ldt*0] );
+ _w0 = _mm256_mul_pd( _w0, _b0 );
+ //
+ _a0 = _mm256_load_pd( &pC20[0+ps*0] );
+ _a0 = _mm256_add_pd( _a0, _w0 );
+ _mm256_store_pd( &pC20[0+ps*0], _a0 );
+ _a0 = _mm256_load_pd( &pC20[0+ps*1] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*1] );
+ _t0 = _mm256_mul_pd( _w0, _b0 );
+ _a0 = _mm256_add_pd( _a0, _t0 );
+ _a0 = _mm256_add_pd( _a0, _w1 );
+ _mm256_store_pd( &pC20[0+ps*1], _a0 );
+ for(kk=2; kk<kmax; kk++)
+ {
+ _a0 = _mm256_load_pd( &pC20[0+ps*kk] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _w0, _b0 );
+ _a0 = _mm256_add_pd( _a0, _t0 );
+ _b0 = _mm256_broadcast_sd( &pC10[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _w1, _b0 );
+ _a0 = _mm256_add_pd( _a0, _t0 );
+ _mm256_store_pd( &pC20[0+ps*kk], _a0 );
+ }
+ pC20 += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+ w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+ w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+ }
+ w01 = w00*pT[0+ldt*1] + w01*pT[1+ldt*1];
+ w00 = w00*pT[0+ldt*0];
+ pC20[0+ps*0] += w00*1.0 + w01*0.0;
+ pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+ }
+ pC20 += 1;
+ }
+ }
+#endif
+ for(; ii<imax; ii++)
+ {
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ tmp = pC00[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC00[0] = beta;
+ for(jj=1; jj<n-ii; jj++)
+ pC00[0+ps*jj] *= tmp;
+ }
+ if(ii<n)
+ {
+ // compute T
+ pT[0+ldt*0] = - dD[ii+0];
+ // downgrade
+ kmax = n-ii;
+ jmax = m-ii-1;
+ jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ jj = 0;
+ pC10a = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ pC10 = pC10a;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ w00 = pC10[0+ps*0];
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = w00*pT[0+ldt*0];
+ pC10[0+ps*0] += w00;
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ pC10 += 1;
+ }
+ pC10 += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ //
+ _w0 = _mm256_load_pd( &pC10[0+ps*0] );
+ for(kk=1; kk<kmax; kk++)
+ {
+ _a0 = _mm256_load_pd( &pC10[0+ps*kk] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _a0, _b0 );
+ _w0 = _mm256_add_pd( _w0, _t0 );
+ }
+ //
+ _b0 = _mm256_broadcast_sd( &pT[0+ldt*0] );
+ _w0 = _mm256_mul_pd( _w0, _b0 );
+ //
+ _a0 = _mm256_load_pd( &pC10[0+ps*0] );
+ _a0 = _mm256_add_pd( _a0, _w0 );
+ _mm256_store_pd( &pC10[0+ps*0], _a0 );
+ for(kk=1; kk<kmax; kk++)
+ {
+ _a0 = _mm256_load_pd( &pC10[0+ps*kk] );
+ _b0 = _mm256_broadcast_sd( &pC00[0+ps*kk] );
+ _t0 = _mm256_mul_pd( _w0, _b0 );
+ _a0 = _mm256_add_pd( _a0, _t0 );
+ _mm256_store_pd( &pC10[0+ps*kk], _a0 );
+ }
+ pC10 += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ w00 = pC10[0+ps*0];
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = w00*pT[0+ldt*0];
+ pC10[0+ps*0] += w00;
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ pC10 += 1;
+ }
+ }
+ }
+ return;
+ }
+
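+
+
+// Reference sketch (illustrative only): the kernels in this file repeat the
+// same scalar Householder step on the rows of the panel. The helper below is
+// a minimal plain-C version of that step under the same convention (the
+// reflector tail overwrites the row, the new diagonal entry overwrites x[0],
+// tau goes to *dd). The name ref_house_row_sketch and the contiguous row
+// layout are assumptions for illustration, not part of the BLASFEO API.
+static void ref_house_row_sketch(int n, double *x, double *dd)
+ {
+ int ii;
+ double alpha, beta, tmp;
+ beta = 0.0;
+ for(ii=1; ii<n; ii++)
+ beta += x[ii]*x[ii];
+ if(beta==0.0)
+ {
+ // nothing to annihilate: tau = 0 and the row is left untouched
+ dd[0] = 0.0;
+ return;
+ }
+ alpha = x[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta; // choose the sign that avoids cancellation
+ dd[0] = (beta-alpha) / beta; // tau
+ tmp = 1.0 / (alpha-beta); // scaling of the reflector tail
+ x[0] = beta; // new diagonal entry
+ for(ii=1; ii<n; ii++)
+ x[ii] *= tmp; // v[ii], with v[0] == 1 left implicit
+ return;
+ }
+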
+
+
+// assume kmax>=4
+void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT)
+ {
+ const int ps = 4;
+ int kk;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ // 0
+ // 1
+ v10 = pD[0+ps*1];
+ // 2
+ v10 += pD[1+ps*2]*pD[0+ps*2];
+ v20 = pD[0+ps*2];
+ v21 = pD[1+ps*2];
+ // 3
+ v10 += pD[1+ps*3]*pD[0+ps*3];
+ v20 += pD[2+ps*3]*pD[0+ps*3];
+ v21 += pD[2+ps*3]*pD[1+ps*3];
+ v30 = pD[0+ps*3];
+ v31 = pD[1+ps*3];
+ v32 = pD[2+ps*3];
+ //
+ for(kk=4; kk<kmax; kk++)
+ {
+ v10 += pD[1+ps*kk]*pD[0+ps*kk];
+ v20 += pD[2+ps*kk]*pD[0+ps*kk];
+ v30 += pD[3+ps*kk]*pD[0+ps*kk];
+ v21 += pD[2+ps*kk]*pD[1+ps*kk];
+ v31 += pD[3+ps*kk]*pD[1+ps*kk];
+ v32 += pD[3+ps*kk]*pD[2+ps*kk];
+ }
+ pT[0+ps*0] = - dD[0];
+ pT[1+ps*1] = - dD[1];
+ pT[2+ps*2] = - dD[2];
+ pT[3+ps*3] = - dD[3];
+ pT[0+ps*1] = - dD[1] * (v10*pT[0+ps*0]);
+ pT[1+ps*2] = - dD[2] * (v21*pT[1+ps*1]);
+ pT[2+ps*3] = - dD[3] * (v32*pT[2+ps*2]);
+ pT[0+ps*2] = - dD[2] * (v20*pT[0+ps*0] + v21*pT[0+ps*1]);
+ pT[1+ps*3] = - dD[3] * (v31*pT[1+ps*1] + v32*pT[1+ps*2]);
+ pT[0+ps*3] = - dD[3] * (v30*pT[0+ps*0] + v31*pT[0+ps*1] + v32*pT[0+ps*2]);
+ return;
+ }
+
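+
+
+// Reference sketch (illustrative only; the helper name, the ldt argument and
+// the vdot array are assumptions, not BLASFEO API): kernel_dlarft_4_lib4
+// above hard-codes, for 4 reflectors, the dlarft-style recurrence
+// T(i,i) = -tau[i]
+// T(0:i-1,i) = -tau[i] * T(0:i-1,0:i-1) * (V(0:i-1,:) * v_i^T)
+// where the dot products V(j,:)*V(i,:)^T are the scalars v10..v32. A generic
+// (unoptimized) update of column i of T would look like this:
+static void ref_larft_col_sketch(int i, int ldt, double *vdot, double *tau, double *T)
+ {
+ // vdot[j] = V(j,:)*V(i,:)^T for j<i, accumulated by the caller
+ int jj, kk;
+ T[i+ldt*i] = - tau[i];
+ for(jj=0; jj<i; jj++)
+ {
+ T[jj+ldt*i] = 0.0;
+ for(kk=jj; kk<i; kk++)
+ T[jj+ldt*i] += T[jj+ldt*kk] * vdot[kk];
+ T[jj+ldt*i] *= - tau[i];
+ }
+ return;
+ }
+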
+
+
+// assume n>=4
+#if ! defined(TARGET_X64_INTEL_HASWELL)
+void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w0, w1, w2, w3;
+ const int ps = 4;
+ // zero tau matrix
+ for(ii=0; ii<16; ii++)
+ pT[ii] = 0.0;
+ // first column
+ beta = 0.0;
+ for(ii=1; ii<n; ii++)
+ {
+ tmp = pD[0+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[0] = 0.0;
+ tmp = 0.0;
+ goto col2;
+ }
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[0] = (beta-alpha) / beta;
+ pT[0+ps*0] = - dD[0];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[0+ps*0] = beta;
+ w1 = pD[1+ps*0];
+ w2 = pD[2+ps*0];
+ w3 = pD[3+ps*0];
+ //
+ pD[0+ps*1] *= tmp;
+ w1 += pD[1+ps*1] * pD[0+ps*1];
+ w2 += pD[2+ps*1] * pD[0+ps*1];
+ w3 += pD[3+ps*1] * pD[0+ps*1];
+ //
+ pD[0+ps*2] *= tmp;
+ w1 += pD[1+ps*2] * pD[0+ps*2];
+ w2 += pD[2+ps*2] * pD[0+ps*2];
+ w3 += pD[3+ps*2] * pD[0+ps*2];
+ //
+ pD[0+ps*3] *= tmp;
+ w1 += pD[1+ps*3] * pD[0+ps*3];
+ w2 += pD[2+ps*3] * pD[0+ps*3];
+ w3 += pD[3+ps*3] * pD[0+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[0+ps*ii] *= tmp;
+ w1 += pD[1+ps*ii] * pD[0+ps*ii];
+ w2 += pD[2+ps*ii] * pD[0+ps*ii];
+ w3 += pD[3+ps*ii] * pD[0+ps*ii];
+ }
+ //
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ //
+ pD[1+ps*0] += w1;
+ pD[2+ps*0] += w2;
+ pD[3+ps*0] += w3;
+ //
+ pD[1+ps*1] += w1 * pD[0+ps*1];
+ pD[2+ps*1] += w2 * pD[0+ps*1];
+ pD[3+ps*1] += w3 * pD[0+ps*1];
+ //
+ pD[1+ps*2] += w1 * pD[0+ps*2];
+ pD[2+ps*2] += w2 * pD[0+ps*2];
+ pD[3+ps*2] += w3 * pD[0+ps*2];
+ beta = pD[1+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] += w1 * pD[0+ps*3];
+ pD[2+ps*3] += w2 * pD[0+ps*3];
+ pD[3+ps*3] += w3 * pD[0+ps*3];
+ beta += pD[1+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] += w1 * pD[0+ps*ii];
+ pD[2+ps*ii] += w2 * pD[0+ps*ii];
+ pD[3+ps*ii] += w3 * pD[0+ps*ii];
+ beta += pD[1+ps*ii] * pD[1+ps*ii];
+ }
+ // second column
+col2:
+ if(beta==0.0)
+ {
+ dD[1] = 0.0;
+ tmp = 0.0;
+ goto col3;
+ }
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[1] = (beta-alpha) / beta;
+ pT[1+ps*1] = - dD[1];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[1+ps*1] = beta;
+ w0 = pD[0+ps*1]; //
+ w2 = pD[2+ps*1];
+ w3 = pD[3+ps*1];
+ //
+ pD[1+ps*2] *= tmp;
+ w0 += pD[0+ps*2] * pD[1+ps*2]; //
+ w2 += pD[2+ps*2] * pD[1+ps*2];
+ w3 += pD[3+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] *= tmp;
+ w0 += pD[0+ps*3] * pD[1+ps*3]; //
+ w2 += pD[2+ps*3] * pD[1+ps*3];
+ w3 += pD[3+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
+ w2 += pD[2+ps*ii] * pD[1+ps*ii];
+ w3 += pD[3+ps*ii] * pD[1+ps*ii];
+ }
+ //
+ pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ //
+ pD[2+ps*1] += w2;
+ pD[3+ps*1] += w3;
+ //
+ pD[2+ps*2] += w2 * pD[1+ps*2];
+ pD[3+ps*2] += w3 * pD[1+ps*2];
+ //
+ pD[2+ps*3] += w2 * pD[1+ps*3];
+ pD[3+ps*3] += w3 * pD[1+ps*3];
+ beta = pD[2+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] += w2 * pD[1+ps*ii];
+ pD[3+ps*ii] += w3 * pD[1+ps*ii];
+ beta += pD[2+ps*ii] * pD[2+ps*ii];
+ }
+ // third column
+col3:
+ if(beta==0.0)
+ {
+ dD[2] = 0.0;
+ tmp = 0.0;
+ goto col4;
+ }
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[2] = (beta-alpha) / beta;
+ pT[2+ps*2] = - dD[2];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[2+ps*2] = beta;
+ w0 = pD[0+ps*2];
+ w1 = pD[1+ps*2];
+ w3 = pD[3+ps*2];
+ //
+ pD[2+ps*3] *= tmp;
+ w0 += pD[0+ps*3] * pD[2+ps*3];
+ w1 += pD[1+ps*3] * pD[2+ps*3];
+ w3 += pD[3+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[2+ps*ii];
+ w1 += pD[1+ps*ii] * pD[2+ps*ii];
+ w3 += pD[3+ps*ii] * pD[2+ps*ii];
+ }
+ //
+ pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
+ pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
+ w3 = - dD[2] * w3;
+ //
+ pD[3+ps*2] += w3;
+ //
+ pD[3+ps*3] += w3 * pD[2+ps*3];
+ //
+ beta = 0.0;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] += w3 * pD[2+ps*ii];
+ beta += pD[3+ps*ii] * pD[3+ps*ii];
+ }
+ // fourth column
+col4:
+ if(beta==0.0)
+ {
+ dD[3] = 0.0;
+ tmp = 0.0;
+ return;
+ }
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[3] = (beta-alpha) / beta;
+ pT[3+ps*3] = - dD[3];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[3+ps*3] = beta;
+ w0 = pD[0+ps*3];
+ w1 = pD[1+ps*3];
+ w2 = pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[3+ps*ii];
+ w1 += pD[1+ps*ii] * pD[3+ps*ii];
+ w2 += pD[2+ps*ii] * pD[3+ps*ii];
+ }
+ //
+ pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
+ pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
+ pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
+ return;
+ }
+#endif
+
+
+
+void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD)
+ {
+ const int ps = 4;
+ double pW[16];
+ int kk;
+ // 0
+ pW[0+ps*0] = pD[0+ps*0];
+ // 1
+ pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
+ pW[0+ps*1] = pD[0+ps*1];
+ // 2
+ pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
+ pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
+ pW[0+ps*2] = pD[0+ps*2];
+ // 3
+ pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
+ pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
+ pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
+ pW[0+ps*3] = pD[0+ps*3];
+ //
+ for(kk=4; kk<kmax; kk++)
+ {
+ pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
+ pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
+ pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
+ pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
+ }
+ //
+ pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3];
+ //
+ pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
+ //
+ pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
+ //
+ pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
+ //
+ pD[0+ps*0] += pW[0+ps*0];
+ //
+ pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
+ //
+ pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
+ //
+ pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
+ for(kk=4; kk<kmax; kk++)
+ {
+ pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
+ }
+ return;
+ }
+
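+
+
+// Reference sketch (illustrative only; the helper name and the plain row
+// vector d are assumptions, not BLASFEO API): kernel_dlarfb4_r_1_lib4 above
+// applies the 4-reflector block transformation to one row d of length kmax,
+// that is d <- d * (I + V^T * T * V), with V stored by rows with an implicit
+// unit diagonal and T as produced by kernel_dlarft_4_lib4 (note its sign
+// convention). An unoptimized version of the same update:
+static void ref_larfb4_r_1_sketch(int kmax, double *pV, double *pT, double *d)
+ {
+ const int ps = 4;
+ double w[4], tmp;
+ int ii, jj, kk;
+ // w = d * V^T, exploiting V(ii,kk) = 0 for kk<ii and V(ii,ii) = 1
+ for(ii=0; ii<4; ii++)
+ {
+ w[ii] = d[ii];
+ for(kk=ii+1; kk<kmax; kk++)
+ w[ii] += d[kk] * pV[ii+ps*kk];
+ }
+ // w = w * T, with T upper triangular (4x4, leading dimension ps)
+ for(jj=3; jj>=0; jj--)
+ {
+ tmp = 0.0;
+ for(ii=0; ii<=jj; ii++)
+ tmp += w[ii] * pT[ii+ps*jj];
+ w[jj] = tmp;
+ }
+ // d = d + w * V, again with the implicit unit diagonal of V
+ for(kk=0; kk<kmax; kk++)
+ {
+ for(ii=0; ii<4 && ii<=kk; ii++)
+ d[kk] += w[ii] * (ii==kk ? 1.0 : pV[ii+ps*kk]);
+ }
+ return;
+ }
+// In a blocked LQ sweep these kernels are presumably combined as: factorize a
+// 4-row panel and build T (kernel_dgelqf_dlarft4_4_lib4), then call
+// kernel_dlarfb4_r_1_lib4 once per remaining row to update the trailing rows.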
+
+
+
diff --git a/kernel/avx/kernel_dgetrf_pivot_4_lib4.c b/kernel/avx/kernel_dgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..91d1cc0
--- /dev/null
+++ b/kernel/avx/kernel_dgetrf_pivot_4_lib4.c
@@ -0,0 +1,1434 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
+// C numbering (starting from zero) in the ipiv
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv)
+ {
+
+ const int bs = 4;
+
+ // assume m>=4
+ int ma = m-4;
+
+ __m128d
+ max0, max1, msk0, imx0, imx1,
+ inv;
+
+
+ __m256d
+ lft, msk,
+ sgn, vna, max, imx, idx,
+ ones,
+ tmp,
+ a_0,
+ b_0, b_1, b_2,
+ scl,
+ c_0,
+ d_0;
+
+ double
+ dlft;
+
+ sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+ vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+ lft = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+
+ double
+ tmp0;
+
+ double
+ *pB;
+
+ int
+ k, idamax;
+
+ int B_pref = bs*sda;
+
+
+ // first column
+
+ // find pivot
+ pB = &pA[0+bs*0];
+ idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ k = 0;
+ for( ; k<m-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for( ; k<m-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<m)
+ {
+ dlft = m-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ a_0 = _mm256_load_pd( &pB[0] );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ inv = _mm_loaddup_pd( &pA[0+bs*0] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[0], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[0] = 0.0;
+ }
+
+
+ // second column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ c_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x1 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ _mm256_store_pd( &pA[0+bs*1], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[1] = idamax+1;
+ if(tmp0!=0)
+ {
+ if(ipiv[1]!=1)
+ drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ inv = _mm_loaddup_pd( &pA[1+bs*1] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[1], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[1] = 0.0;
+ }
+
+
+ // third column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x3 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ _mm256_store_pd( &pA[0+bs*2], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[2] = idamax+2;
+ if(tmp0!=0)
+ {
+ if(ipiv[2]!=2)
+ drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ inv = _mm_loaddup_pd( &pA[2+bs*2] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[2], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[2] = 0.0;
+ }
+
+
+ // fourth column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x7 );
+ b_2 = _mm256_permute_pd( b_2, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x7 );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ _mm256_store_pd( &pA[0+bs*3], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[3] = idamax+3;
+ if(tmp0!=0)
+ {
+ if(ipiv[3]!=3)
+ drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ inv = _mm_loaddup_pd( &pA[3+bs*3] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[3], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[3] = 0.0;
+ }
+
+ // scale
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ tmp = _mm256_mul_pd( c_0, scl );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+// pB += B_pref;
+ }
+
+ return;
+
+ }
+
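+
+
+// Reference sketch (illustrative only; the helper name and the per-column
+// interface are assumptions, not BLASFEO API): the vectorized kernel above is
+// a right-looking LU factorization of an m x 4 panel with partial pivoting,
+// interleaving the pivot search of column j+1 with the scaling and rank-1
+// update driven by column j. On the same panel-major layout (bs = 4 rows per
+// panel, panel stride sda), one scalar column step looks like this:
+static void ref_getrf_pivot_col_sketch(int m, int j, double *pA, int sda, double *inv_diag_A, int *ipiv)
+ {
+ const int bs = 4;
+ int ii, jj, ip;
+ double amax, tmp;
+ double *pr_i, *pr_j, *pr_p;
+ pr_j = pA + (j/bs)*bs*sda + j%bs; // start of row j in the panel-major layout
+ // find the pivot: largest magnitude in column j, rows j..m-1 (ties keep the lower index)
+ ip = j;
+ amax = pr_j[bs*j]>0.0 ? pr_j[bs*j] : -pr_j[bs*j];
+ for(ii=j+1; ii<m; ii++)
+ {
+ pr_i = pA + (ii/bs)*bs*sda + ii%bs;
+ tmp = pr_i[bs*j]>0.0 ? pr_i[bs*j] : -pr_i[bs*j];
+ if(tmp>amax)
+ {
+ amax = tmp;
+ ip = ii;
+ }
+ }
+ ipiv[j] = ip; // C numbering (starting from zero), as in the kernel above
+ if(ip!=j)
+ {
+ // swap rows j and ip across the 4 panel columns
+ pr_p = pA + (ip/bs)*bs*sda + ip%bs;
+ for(jj=0; jj<4; jj++)
+ {
+ tmp = pr_j[bs*jj];
+ pr_j[bs*jj] = pr_p[bs*jj];
+ pr_p[bs*jj] = tmp;
+ }
+ }
+ if(amax==0.0)
+ {
+ inv_diag_A[j] = 0.0; // singular column: nothing to scale or update
+ return;
+ }
+ inv_diag_A[j] = 1.0 / pr_j[bs*j];
+ // scale the subdiagonal of column j and rank-1 update of columns j+1..3
+ for(ii=j+1; ii<m; ii++)
+ {
+ pr_i = pA + (ii/bs)*bs*sda + ii%bs;
+ pr_i[bs*j] *= inv_diag_A[j];
+ for(jj=j+1; jj<4; jj++)
+ pr_i[bs*jj] -= pr_i[bs*j] * pr_j[bs*jj];
+ }
+ return;
+ }
+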
+
+
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ // assume m>=4
+ int ma = m-4;
+
+ __m128d
+ max0, max1, msk0, imx0, imx1,
+ inv;
+
+
+ __m256d
+ lft, msk,
+ sgn, vna, max, imx, idx,
+ ones,
+ tmp,
+ a_0,
+ b_0, b_1, b_2,
+ scl,
+ c_0,
+ d_0;
+
+ double
+ dlft;
+
+ sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+ vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+ lft = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+
+ double
+ tmp0;
+
+ double
+ *pB;
+
+ int
+ k, idamax;
+
+ int B_pref = bs*sda;
+
+
+ // first column
+
+ // find pivot
+ pB = &pA[0+bs*0];
+ idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ k = 0;
+ for( ; k<m-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for( ; k<m-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<m)
+ {
+ dlft = m-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ a_0 = _mm256_load_pd( &pB[0] );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ inv = _mm_loaddup_pd( &pA[0+bs*0] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[0], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[0] = 0.0;
+ }
+
+ if(n==1)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+
+ // second column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ c_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ d_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ _mm256_store_pd( &pA[0+bs*1], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>1)
+ {
+ ipiv[1] = idamax+1;
+ if(tmp0!=0)
+ {
+ if(ipiv[1]!=1)
+ drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ inv = _mm_loaddup_pd( &pA[1+bs*1] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[1], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[1] = 0.0;
+ }
+ }
+
+ if(n==2)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+ // third column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ _mm256_store_pd( &pA[0+bs*2], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>2)
+ {
+ ipiv[2] = idamax+2;
+ if(tmp0!=0)
+ {
+ if(ipiv[2]!=2)
+ drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ inv = _mm_loaddup_pd( &pA[2+bs*2] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[2], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[2] = 0.0;
+ }
+ }
+
+ if(n==3)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+ // fourth column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_2 = _mm256_permute_pd( b_2, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x7 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ _mm256_store_pd( &pA[0+bs*3], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>3)
+ {
+ ipiv[3] = idamax+3;
+ if(tmp0!=0)
+ {
+ if(ipiv[3]!=3)
+ drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ inv = _mm_loaddup_pd( &pA[3+bs*3] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[3], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[3] = 0.0;
+ }
+ }
+
+ // scale
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ tmp = _mm256_mul_pd( c_0, scl );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+// pB += B_pref;
+ }
+
+ return;
+
+ }
+
diff --git a/kernel/avx/kernel_dsymv_6_lib4.S b/kernel/avx/kernel_dsymv_6_lib4.S
new file mode 100644
index 0000000..b55690a
--- /dev/null
+++ b/kernel/avx/kernel_dsymv_6_lib4.S
@@ -0,0 +1,1031 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4 <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5 <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4 <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5 <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_nt_6_lib4, @function
+inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_nt_6_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_nt_6_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovapd 0(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 64(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 96(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm14, %ymm9, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 128(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm14, %ymm10, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 160(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm14, %ymm11, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm14
+
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+ vmaskmovpd 0(%r14), %ymm14, %ymm13
+
+ vmovupd %ymm14, -32(%rsp) // spill mask to stack
+
+// vmaskmovpd -32(%rsp), %ymm14
+ vmaskmovpd 0(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd -32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 32(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd -32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 64(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd -32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 96(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm14, %ymm9, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd -32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 128(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm14, %ymm10, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd -32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd 160(%r11), %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm14, %ymm11, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd -32(%rsp), %ymm14 // load mask from stack
+ vmaskmovpd %ymm13, %ymm14, 0(%r14)
+
+ sall $3, %r10d
+ addq %r10, %r11
+ addq %r10, %r13
+ addq %r10, %r14
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_nt_6_lib4, .-inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
+
+
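+// For reference, an illustrative plain-C model of what this routine accumulates
+// (a sketch, not library code). A is a k x 6 block stored in 4-row panels with
+// panel stride sda; yt[0..5] stand for ymm0..ymm5, xn[0..5] for ymm6..ymm11
+// (x_n already scaled by alpha_n). z_n is updated in place, while the z_t
+// partial sums in yt are reduced later by the blend/scale routine.
+//
+//	static void ref_dgemv_add_nt_6(int kmax, const double *A, int sda,
+//			const double *x_t, double *z_n, const double *xn, double yt[6])
+//		{
+//		int k, j;
+//		for(k=0; k<kmax; k++)
+//			{
+//			const double *a = A + (k/4)*4*sda + k%4; // row k of the block
+//			for(j=0; j<6; j++)
+//				{
+//				yt[j]  += a[4*j] * x_t[k]; // partials of z_t = A^T * x_t
+//				z_n[k] += a[4*j] * xn[j];  // z_n += A * (alpha_n * x_n)
+//				}
+//			}
+//		}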
+
+
+
+#if 0
+
+// TODO
+// common inner routine with file scope
+//
+// input arguments:
+// r10 <- kmax
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- kmax-4
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dsymv_add_nt_4_lib4, @function
+inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dsymv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_lib4:
+#endif
+#endif
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovapd 0(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 32(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 64(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 96(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+// vxorpd %ymm15, %ymm15, %ymm15
+// vblendpd $0x0, %ymm14, %ymm15, %ymm14
+// vmulpd %ymm14, %ymm9, %ymm15
+// vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ subq $4, %r10
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dsymv_add_nt_4_lib4, .-inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_6_lib4, @function
+inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_6_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_6_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm5, %ymm4, %ymm4
+// vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vextractf128 $0x1, %ymm4, %xmm5
+ vaddpd %ymm0, %ymm1, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm4
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm4, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmovupd 32(%r12), %ymm13
+ vmulpd %ymm15, %ymm14, %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+ vmulpd %ymm15, %ymm13, %ymm13
+ vaddpd %ymm1, %ymm13, %ymm1
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm1, %ymm15, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_6_lib4, .-inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
+
+
+
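+// For reference, a scalar model of this blend/scale step (sketch only). Each
+// accumulator ymm0..ymm5 holds four partial sums of one z_t entry; the
+// horizontal adds and lane permutes collapse them into [z0 z1 z2 z3] and
+// [z4 z5], which are then scaled as z = alpha*sum + beta*y (the final blend
+// zeroes the two upper lanes of ymm1). yt4[j][0..3] below stands for the four
+// lanes of accumulator j.
+//
+//	int j;
+//	for(j=0; j<6; j++)
+//		{
+//		double s = yt4[j][0] + yt4[j][1] + yt4[j][2] + yt4[j][3];
+//		z[j] = alpha[0]*s + beta[0]*y[j];
+//		}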
+
+
+#if 0
+
+//TODO
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_a1_4_lib4, @function
+inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_a1_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+
+ // beta
+ vmovupd 0(%r11), %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_a1_4_lib4, .-inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+#endif
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// xmm1 <- [z4 z5]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// xmm1 <- [z4 z5]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_6_lib4, @function
+inner_store_6_lib4:
+#elif defined(OS_MAC)
+_inner_store_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_6_lib4; .scl 2; .type 32; .endef
+inner_store_6_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %xmm1, 32(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_6_lib4, .-inner_store_6_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dgemv_nt_6_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_nt_6_lib4
+ .type kernel_dgemv_nt_6_lib4, @function
+kernel_dgemv_nt_6_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_nt_6_lib4
+_kernel_dgemv_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_nt_6_lib4
+ .def kernel_dgemv_nt_6_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_6_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+ vbroadcastsd 32(%r10), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vbroadcastsd 40(%r10), %ymm11
+ vmulpd %ymm15, %ymm11, %ymm11
+
+
+ // inner kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
+
+ // inner blend t scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_6_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_nt_6_lib4, .-kernel_dgemv_nt_6_lib4
+#endif
+
+
+
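+// A possible call from C, matching the prototype above (sketch only; sizes and
+// scalar values are made up). For a k x 6 block A stored in 4-row panels with
+// panel stride sda, the kernel computes, in a single sweep over A,
+//   z_n = z_n + alpha_n * A * x_n              (x_n of length 6, z_n of length k)
+//   z_t = beta_t * y_t + alpha_t * A^T * x_t   (x_t of length k; y_t, z_t of length 6)
+//
+//	double alpha_n = 1.0, alpha_t = 1.0, beta_t = 0.0;
+//	kernel_dgemv_nt_6_lib4(k, &alpha_n, &alpha_t, A, sda,
+//		x_n, x_t, &beta_t, y_t, z_n, z_t);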
+
+
+#if 0
+// TODO
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x_n, double *x_t, double *z_n, double *z_t);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsymv_l_4_lib4
+ .type kernel_dsymv_l_4_lib4, @function
+kernel_dsymv_l_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsymv_l_4_lib4
+_kernel_dsymv_l_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsymv_l_4_lib4
+ .def kernel_dsymv_l_4_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG5, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+
+
+ // inner edge dsymv & kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG6, %r13 // x_t
+ movq ARG7, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dsymv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+ // call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsymv_l_4_lib4, .-kernel_dsymv_l_4_lib4
+#endif
+
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
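+// The .long pairs above are the little-endian 32-bit halves of IEEE-754 double
+// constants: the pair (0, 1071644672 = 0x3FE00000) encodes 0.5, so .LC02 is the
+// vector 0.5, 1.5, 2.5, 3.5 in increasing lane order (the label comments list
+// the lanes high to low); it is used above to build the clean-up
+// maskload/maskstore mask, .LC03 encodes 4.5..7.5 and .LC04 four copies of 1.0.
+// A small program to reproduce a pair (illustrative only):
+//
+//	#include <stdio.h>
+//	#include <string.h>
+//	int main(void)
+//		{
+//		double d = 0.5;
+//		unsigned int w[2];
+//		memcpy(w, &d, sizeof(d));                   // w[0] = low word, w[1] = high word
+//		printf(".long %u\n.long %u\n", w[0], w[1]); // prints 0 and 1071644672
+//		return 0;
+//		}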
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
+
+
+
+
diff --git a/kernel/avx/kernel_sgead_lib8.S b/kernel/avx/kernel_sgead_lib8.S
new file mode 100644
index 0000000..4cafa0a
--- /dev/null
+++ b/kernel/avx/kernel_sgead_lib8.S
@@ -0,0 +1,3096 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r12 <- A
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_0_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_0_lib8, @function
+inner_kernel_sgead_8_0_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_0_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_0_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r12
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps 64(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%r13), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_0_lib8, .-inner_kernel_sgead_8_0_lib8
+#endif
+#endif
+
+
+
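+// For reference, a plain-C equivalent of this aligned (offset-0) case (sketch
+// only, not library code). A and B are 8 x kmax blocks in lib8 panel storage,
+// with column j starting at float offset 8*j; the kernel streams one 8-float
+// column per iteration and computes B(:,j) += alpha * A(:,j):
+//
+//	static void ref_sgead_8_0(int kmax, float alpha, const float *A, float *B)
+//		{
+//		int i, j;
+//		for(j=0; j<kmax; j++)
+//			for(i=0; i<8; i++)
+//				B[i+8*j] += alpha * A[i+8*j];
+//		}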
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r12 <- A
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_0_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_0_gen_lib8, @function
+inner_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_0_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r12), %ymm0
+ vmaskmovps 0(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovups 32(%r12), %ymm0
+ vmaskmovps 32(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r12
+
+ vmovups -64(%r12), %ymm0
+ vmaskmovps 64(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovups -32(%r12), %ymm0
+ vmaskmovps -32(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovups 0(%r12), %ymm0
+ vmaskmovps 0(%r13), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_0_gen_lib8, .-inner_kernel_sgead_8_0_gen_lib8
+#endif
+#endif
+
+
+
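+// The row mask built above is shared by all the _gen_ variants in this file.
+// Assuming .LC00 (defined at the end of the file, outside this excerpt) holds
+// the per-lane constants { 0.5 1.5 ... 7.5 }, ymm15 = .LC00 - m1 has the sign
+// bit set in lane i exactly when i < m1, so vmaskmovps touches only the first
+// m1 rows. A scalar sketch of the same test:
+//
+//	int i, active[8];
+//	float lc00[8] = {0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f};
+//	for(i=0; i<8; i++)
+//		active[i] = (lc00[i] - (float)m1 < 0.0f); // 1 -> row i is loaded/stored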
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_1_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_1_lib8, @function
+inner_kernel_sgead_8_1_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_1_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_1_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+ addq %r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+#if 1
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+#else
+ vmovups 4(%r12), %ymm0
+ vmovups -28(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovups 36(%r12), %ymm0
+ vmovups 4(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovups -60(%r12), %ymm0
+ vmovups -92(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovups -28(%r12), %ymm0
+ vmovups -60(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+#endif
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_1_lib8, .-inner_kernel_sgead_8_1_lib8
+#endif
+#endif
+
+
+
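+// For reference, a scalar model of this offset-1 case (sketch only). The source
+// sub-matrix starts one row into its 8-row panel, so each output column takes
+// rows 1..7 from panel A0 and row 0 from the next panel A1 = A0 + 8*sda; this
+// is what the blend / in-lane rotate / cross-lane swap / blend sequence above
+// reassembles. The offset-2, -3 and -5 kernels below follow the same pattern
+// with different blend and permute immediates.
+//
+//	int i, j;
+//	for(j=0; j<kmax; j++)
+//		for(i=0; i<8; i++)
+//			B[i+8*j] += alpha * (i<7 ? A0[i+1+8*j] : A1[0+8*j]);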
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_1_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_1_gen_lib8, @function
+inner_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_1_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+ addq %r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_1_gen_lib8, .-inner_kernel_sgead_8_1_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_2_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_2_lib8, @function
+inner_kernel_sgead_8_2_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_2_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_2_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+ addq %r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_2_lib8, .-inner_kernel_sgead_8_2_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_2_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_2_gen_lib8, @function
+inner_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_2_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+ addq %r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_2_gen_lib8, .-inner_kernel_sgead_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_3_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_3_lib8, @function
+inner_kernel_sgead_8_3_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_3_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_3_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+ addq %r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_3_lib8, .-inner_kernel_sgead_8_3_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_3_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_3_gen_lib8, @function
+inner_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_3_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+ addq %r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_3_gen_lib8, .-inner_kernel_sgead_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_4_lib8, @function
+inner_kernel_sgead_8_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_4_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+ addq %r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 16(%r12), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 48(%r12), %xmm0
+ vmovaps 32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+
+ vmovaps -48(%r12), %xmm0
+ vmovaps 64(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %rax
+
+ vmovaps -16(%r12), %xmm0
+ vmovaps -32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps 96(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 96(%r14)
+ addq $128, %r14
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 16(%r12), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_4_lib8, .-inner_kernel_sgead_8_4_lib8
+#endif
+#endif
+
+
+
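+// For reference, a scalar model of this offset-4 case (sketch only). With an
+// offset of exactly half a panel no shuffling is needed: the low 128 bits come
+// from rows 4..7 of panel A0 and the high 128 bits from rows 0..3 of the next
+// panel A1 (the vinsertf128 pair above):
+//
+//	int i, j;
+//	for(j=0; j<kmax; j++)
+//		for(i=0; i<8; i++)
+//			B[i+8*j] += alpha * (i<4 ? A0[i+4+8*j] : A1[i-4+8*j]);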
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_4_gen_lib8, @function
+inner_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_4_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+ addq %r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 16(%r12), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 48(%r12), %xmm0
+ vmovaps 32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+
+ vmovaps -48(%r12), %xmm0
+ vmovaps 64(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %rax
+
+ vmovaps -16(%r12), %xmm0
+ vmovaps -32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 96(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 96(%r14)
+ addq $128, %r14
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 16(%r12), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_4_gen_lib8, .-inner_kernel_sgead_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_5_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_5_lib8, @function
+inner_kernel_sgead_8_5_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_5_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_5_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+ addq %r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_5_lib8, .-inner_kernel_sgead_8_5_lib8
+#endif
+#endif
+
+
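+// Shuffle note for the offset-5 kernel above: after the first vblendps each
+// column sits in a register as a = { A1[0..4], A0[5..7] }; the following
+// vpermilps/vperm2f128/vblendps triple rotates it left by 5 lanes, giving
+// { A0[5..7], A1[0..4] }. A small C model of the three steps (array and
+// function names are illustrative only):
+//
+// static void rotate5_ref(const float a[8], float out[8])
+//     {
+//     float t[8], s[8];
+//     for (int i = 0; i < 8; i++)
+//         t[i] = a[(i & 4) | ((i + 1) & 3)];          // vpermilps $0x39
+//     for (int i = 0; i < 8; i++)
+//         s[i] = t[(i + 4) & 7];                      // vperm2f128 $0x01
+//     for (int i = 0; i < 8; i++)
+//         out[i] = (i == 3 || i == 7) ? t[i] : s[i];  // vblendps $0x88
+//     // result: out[i] == a[(i + 5) % 8]
+//     }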
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_5_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_5_gen_lib8, @function
+inner_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_5_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_5_gen_lib8, .-inner_kernel_sgead_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_6_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_6_lib8, @function
+inner_kernel_sgead_8_6_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_6_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_6_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_6_lib8, .-inner_kernel_sgead_8_6_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_6_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_6_gen_lib8, @function
+inner_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_6_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_6_gen_lib8, .-inner_kernel_sgead_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_7_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_7_lib8, @function
+inner_kernel_sgead_8_7_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_7_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_7_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps 32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps 64(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps -32(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps 0(%r14), %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_7_lib8, .-inner_kernel_sgead_8_7_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- 8*sda*sizeof(float)
+// r14 <- B
+// r15d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGEAD_8_7_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgead_8_7_gen_lib8, @function
+inner_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgead_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgead_8_7_gen_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm14
+
+ // compute mask for rows
+ vcvtsi2ss %r15d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r12, %rax // A1 <- A0
+	addq	%r13, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps 32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r14)
+ addq $128, %r12
+ addq $128, %rax
+
+ vmovaps -64(%r12), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps 64(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r14)
+ addq $128, %r14
+
+ vmovaps -32(%r12), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps -32(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r14)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps 0(%r14), %ymm15, %ymm13
+ vmulps %ymm14, %ymm0, %ymm0
+ vaddps %ymm13, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r14)
+ subl $1, %r10d
+ addq $32, %r12
+ addq $32, %rax
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgead_8_7_gen_lib8, .-inner_kernel_sgead_8_7_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi    rsi           rdx        rcx
+// void kernel_sgead_8_0_lib8(int k, float *alpha, float *A, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_0_lib8
+ .type kernel_sgead_8_0_lib8, @function
+kernel_sgead_8_0_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_0_lib8
+_kernel_sgead_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_0_lib8
+ .def kernel_sgead_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_0_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_0_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_0_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_0_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_0_lib8, .-kernel_sgead_8_0_lib8
+#endif
+
+
+
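+// Reference sketch of the wrapper above (plain C, illustration only; the
+// exported symbol is the assembly entry point, the _ref name is hypothetical):
+//
+// void kernel_sgead_8_0_ref(int k, float *alpha, float *A, float *B)
+//     {
+//     for (int j = 0; j < k; j++)
+//         for (int i = 0; i < 8; i++)
+//             B[i + 8 * j] += alpha[0] * A[i + 8 * j];
+//     }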
+
+
+// rdi    rsi           rdx       rcx       r8
+// void kernel_sgead_8_0_gen_lib8(int k, float *alpha, float *A, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_0_gen_lib8
+ .type kernel_sgead_8_0_gen_lib8, @function
+kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_0_gen_lib8
+_kernel_sgead_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_0_gen_lib8
+ .def kernel_sgead_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_0_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_0_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_0_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_0_gen_lib8, .-kernel_sgead_8_0_gen_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8
+// void kernel_sgead_8_1_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_1_lib8
+ .type kernel_sgead_8_1_lib8, @function
+kernel_sgead_8_1_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_1_lib8
+_kernel_sgead_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_1_lib8
+ .def kernel_sgead_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_1_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_1_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_1_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_1_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_1_lib8, .-kernel_sgead_8_1_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8        r9
+// void kernel_sgead_8_1_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_1_gen_lib8
+ .type kernel_sgead_8_1_gen_lib8, @function
+kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_1_gen_lib8
+_kernel_sgead_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_1_gen_lib8
+ .def kernel_sgead_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_1_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // call inner dgemm kernel nt
+	// call inner sgead kernel
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_1_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_1_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_1_gen_lib8, .-kernel_sgead_8_1_gen_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8
+// void kernel_sgead_8_2_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_2_lib8
+ .type kernel_sgead_8_2_lib8, @function
+kernel_sgead_8_2_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_2_lib8
+_kernel_sgead_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_2_lib8
+ .def kernel_sgead_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_2_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_2_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_2_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_2_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_2_lib8, .-kernel_sgead_8_2_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8        r9
+// void kernel_sgead_8_2_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_2_gen_lib8
+ .type kernel_sgead_8_2_gen_lib8, @function
+kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_2_gen_lib8
+_kernel_sgead_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_2_gen_lib8
+ .def kernel_sgead_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_2_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_2_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_2_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_2_gen_lib8, .-kernel_sgead_8_2_gen_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8
+// void kernel_sgead_8_3_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_3_lib8
+ .type kernel_sgead_8_3_lib8, @function
+kernel_sgead_8_3_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_3_lib8
+_kernel_sgead_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_3_lib8
+ .def kernel_sgead_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_3_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_3_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_3_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_3_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_3_lib8, .-kernel_sgead_8_3_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8        r9
+// void kernel_sgead_8_3_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_3_gen_lib8
+ .type kernel_sgead_8_3_gen_lib8, @function
+kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_3_gen_lib8
+_kernel_sgead_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_3_gen_lib8
+ .def kernel_sgead_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_3_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_3_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_3_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_3_gen_lib8, .-kernel_sgead_8_3_gen_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8
+// void kernel_sgead_8_4_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_4_lib8
+ .type kernel_sgead_8_4_lib8, @function
+kernel_sgead_8_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_4_lib8
+_kernel_sgead_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_4_lib8
+ .def kernel_sgead_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_4_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_4_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_4_lib8, .-kernel_sgead_8_4_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8        r9
+// void kernel_sgead_8_4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_4_gen_lib8
+ .type kernel_sgead_8_4_gen_lib8, @function
+kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_4_gen_lib8
+_kernel_sgead_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_4_gen_lib8
+ .def kernel_sgead_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_4_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_4_gen_lib8, .-kernel_sgead_8_4_gen_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8
+// void kernel_sgead_8_5_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_5_lib8
+ .type kernel_sgead_8_5_lib8, @function
+kernel_sgead_8_5_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_5_lib8
+_kernel_sgead_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_5_lib8
+ .def kernel_sgead_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_5_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_5_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_5_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_5_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_5_lib8, .-kernel_sgead_8_5_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8        r9
+// void kernel_sgead_8_5_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_5_gen_lib8
+ .type kernel_sgead_8_5_gen_lib8, @function
+kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_5_gen_lib8
+_kernel_sgead_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_5_gen_lib8
+ .def kernel_sgead_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_5_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_5_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_5_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_5_gen_lib8, .-kernel_sgead_8_5_gen_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8
+// void kernel_sgead_8_6_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_6_lib8
+ .type kernel_sgead_8_6_lib8, @function
+kernel_sgead_8_6_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_6_lib8
+_kernel_sgead_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_6_lib8
+ .def kernel_sgead_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_6_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_6_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_6_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_6_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_6_lib8, .-kernel_sgead_8_6_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8        r9
+// void kernel_sgead_8_6_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_6_gen_lib8
+ .type kernel_sgead_8_6_gen_lib8, @function
+kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_6_gen_lib8
+_kernel_sgead_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_6_gen_lib8
+ .def kernel_sgead_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_6_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_6_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_6_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_6_gen_lib8, .-kernel_sgead_8_6_gen_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8
+// void kernel_sgead_8_7_lib8(int k, float *alpha, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_7_lib8
+ .type kernel_sgead_8_7_lib8, @function
+kernel_sgead_8_7_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_7_lib8
+_kernel_sgead_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_7_lib8
+ .def kernel_sgead_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_7_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_7_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_7_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_7_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_7_lib8, .-kernel_sgead_8_7_lib8
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8        r9
+// void kernel_sgead_8_7_gen_lib8(int k, float *alpha, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgead_8_7_gen_lib8
+ .type kernel_sgead_8_7_gen_lib8, @function
+kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgead_8_7_gen_lib8
+_kernel_sgead_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgead_8_7_gen_lib8
+ .def kernel_sgead_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgead_8_7_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgead kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+ movq ARG4, %r13 // 8*sda*sizeof(float)
+ sall $5, %r13d
+ movq ARG5, %r14 // B
+ movq ARG6, %r15 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGEAD_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgead_8_7_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgead_8_7_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgead_8_7_gen_lib8, .-kernel_sgead_8_7_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
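+
+// Mask note: the *_gen kernels convert m1 to float, broadcast it, and compute
+// .LC00 - m1 = { 0.5 - m1, 1.5 - m1, ..., 7.5 - m1 }. vmaskmovps only inspects
+// the sign bit of each element, so lane i is active exactly when i + 0.5 < m1,
+// i.e. for the first m1 of the 8 rows. A C sketch of the same mask (the
+// function name is illustrative only):
+//
+// static void row_mask_ref(int m1, float mask[8])
+//     {
+//     for (int i = 0; i < 8; i++)
+//         mask[i] = (i + 0.5f) - (float) m1; // negative sign <=> row i stored
+//     }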
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgecp_lib8.S b/kernel/avx/kernel_sgecp_lib8.S
new file mode 100644
index 0000000..5cd2c00
--- /dev/null
+++ b/kernel/avx/kernel_sgecp_lib8.S
@@ -0,0 +1,2796 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_0_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_0_lib8, @function
+inner_kernel_sgecp_8_0_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_0_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_0_lib8:
+#endif
+#endif
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps %ymm0, 0(%r12)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps %ymm0, 32(%r12)
+ addq $128, %r11
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps %ymm0, 64(%r12)
+ addq $128, %r12
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps %ymm0, -32(%r12)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps %ymm0, 0(%r12)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_0_lib8, .-inner_kernel_sgecp_8_0_lib8
+#endif
+#endif
+
+
+
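+// Reference sketch of the aligned copy subroutine above: it copies k columns of
+// an 8-row panel from A to B, four columns per main-loop iteration plus a
+// one-column clean-up loop. Plain-C equivalent (names are illustrative only):
+//
+// static void sgecp_8_0_ref(int k, const float *A, float *B)
+//     {
+//     for (int j = 0; j < k; j++)
+//         for (int i = 0; i < 8; i++)
+//             B[i + 8 * j] = A[i + 8 * j];
+//     }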
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_0_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_0_gen_lib8, @function
+inner_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_0_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r12)
+ subl $4, %r10d
+
+ vmovups 32(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r12)
+ addq $128, %r11
+
+ vmovups -64(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r12)
+ addq $128, %r12
+
+ vmovups -32(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r12)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovups 0(%r11), %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r12)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgecp_8_0_gen_lib8, .-inner_kernel_sgecp_8_0_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_1_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_1_lib8, @function
+inner_kernel_sgecp_8_1_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_1_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_1_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+#if 1
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+#else
+ vmovups 4(%r11), %ymm0
+ vmovups -28(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovups 36(%r11), %ymm0
+ vmovups 4(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovups -60(%r11), %ymm0
+ vmovups -92(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovups -28(%r11), %ymm0
+ vmovups -60(%rax), %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r13)
+#endif
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_1_lib8, .-inner_kernel_sgecp_8_1_lib8
+#endif
+#endif
+
+
+
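+// Note on the #if 1 / #else pair in the kernel above: both branches appear to
+// build the same shifted column, per column out[i] = (i < 7) ? A0[1 + i] : A1[0].
+// The compiled branch uses aligned vmovaps loads plus the blend/permute
+// rotation; the disabled branch reads the same data with unaligned vmovups at a
+// one-float offset and a single vblendps.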
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_1_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_1_gen_lib8, @function
+inner_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_1_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x77, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_1_gen_lib8, .-inner_kernel_sgecp_8_1_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_2_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_2_lib8, @function
+inner_kernel_sgecp_8_2_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_2_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_2_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_2_lib8, .-inner_kernel_sgecp_8_2_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_2_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_2_gen_lib8, @function
+inner_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_2_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x03, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x33, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_2_gen_lib8, .-inner_kernel_sgecp_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_3_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_3_lib8, @function
+inner_kernel_sgecp_8_3_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_3_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_3_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_3_lib8, .-inner_kernel_sgecp_8_3_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_3_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_3_gen_lib8, @function
+inner_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_3_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x07, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x11, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_3_gen_lib8, .-inner_kernel_sgecp_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
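+//
+// note: for a row offset of 4 no shuffles are needed: each output column is
+// the upper 128-bit half of the A0 column (rows 4..7) followed by the lower
+// half of the A1 column (rows 0..3), assembled with vmovaps + vinsertf128.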
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_4_lib8, @function
+inner_kernel_sgecp_8_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_4_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 16(%r11), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 48(%r11), %xmm0
+ vmovaps 32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+
+ vmovaps -48(%r11), %xmm0
+ vmovaps 64(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %rax
+
+ vmovaps -16(%r11), %xmm0
+ vmovaps -32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 96(%r13)
+ addq $128, %r13
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 16(%r11), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_4_lib8, .-inner_kernel_sgecp_8_4_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_4_gen_lib8, @function
+inner_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 16(%r11), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 48(%r11), %xmm0
+ vmovaps 32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+
+ vmovaps -48(%r11), %xmm0
+ vmovaps 64(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %rax
+
+ vmovaps -16(%r11), %xmm0
+ vmovaps -32(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 96(%r13)
+ addq $128, %r13
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 16(%r11), %xmm0
+ vmovaps 0(%rax), %xmm1
+ vinsertf128 $0x01, %xmm1, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_4_gen_lib8, .-inner_kernel_sgecp_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_5_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_5_lib8, @function
+inner_kernel_sgecp_8_5_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_5_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_5_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_5_lib8, .-inner_kernel_sgecp_8_5_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_5_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_5_gen_lib8, @function
+inner_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_5_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x1f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x39, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0x88, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_5_gen_lib8, .-inner_kernel_sgecp_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_6_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_6_lib8, @function
+inner_kernel_sgecp_8_6_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_6_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_6_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_6_lib8, .-inner_kernel_sgecp_8_6_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_6_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_6_gen_lib8, @function
+inner_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_6_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x3f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x4e, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xcc, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_6_gen_lib8, .-inner_kernel_sgecp_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_7_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_7_lib8, @function
+inner_kernel_sgecp_8_7_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_7_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_7_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+	vpermilps	$0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmovaps %ymm0, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_7_lib8, .-inner_kernel_sgecp_8_7_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12d <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGECP_8_7_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgecp_8_7_gen_lib8, @function
+inner_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgecp_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgecp_8_7_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ movq %r11, %rax // A1 <- A0
+	addq	%r12, %rax // A1 <- A0 + 8*sda*sizeof(float)
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm0
+ vmovaps 32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r13)
+ addq $128, %r11
+ addq $128, %rax
+
+ vmovaps -64(%r11), %ymm0
+ vmovaps -64(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r13)
+ addq $128, %r13
+
+ vmovaps -32(%r11), %ymm0
+ vmovaps -32(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 0(%rax), %ymm1
+ vblendps $0x7f, %ymm1, %ymm0, %ymm0
+ vpermilps $0x93, %ymm0, %ymm0
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm1
+ vblendps $0xee, %ymm0, %ymm1, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r13)
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %rax
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgecp_8_7_gen_lib8, .-inner_kernel_sgecp_8_7_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx
+// void kernel_sgecp_8_0_lib8(int k, float *A, float *B);
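+//
+// reference semantics (illustrative C, not part of the build): copy a
+// panel-major 8 x k block one aligned 8-float column at a time:
+//
+//	for(jj=0; jj<k; jj++)
+//		for(ii=0; ii<8; ii++)
+//			B[ii+8*jj] = A[ii+8*jj];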
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_0_lib8
+ .type kernel_sgecp_8_0_lib8, @function
+kernel_sgecp_8_0_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_0_lib8
+_kernel_sgecp_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_0_lib8
+ .def kernel_sgecp_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_0_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_0_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_0_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_0_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_0_lib8, .-kernel_sgecp_8_0_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_0_gen_lib8(int k, float *A, float *B, int m1);
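+//
+// the *_gen variants take the extra argument m1 and store only the first m1
+// rows of each column (via the vmaskmovps row mask computed in the inner
+// kernel); the remaining rows of B are left untouched.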
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_0_gen_lib8
+ .type kernel_sgecp_8_0_gen_lib8, @function
+kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_0_gen_lib8
+_kernel_sgecp_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_0_gen_lib8
+ .def kernel_sgecp_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_0_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_0_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_0_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_0_gen_lib8, .-kernel_sgecp_8_0_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_1_lib8(int k, float *A, int sda, float *B);
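+//
+// reference semantics of the offset variants kernel_sgecp_8_X_lib8
+// (illustrative C, not part of the build), with X the row offset into the
+// source panel and A1 = A + 8*sda the next panel:
+//
+//	for(jj=0; jj<k; jj++)
+//		for(ii=0; ii<8; ii++)
+//			B[ii+8*jj] = ii<8-X ? A[X+ii+8*jj] : A1[X+ii-8+8*jj];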
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_1_lib8
+ .type kernel_sgecp_8_1_lib8, @function
+kernel_sgecp_8_1_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_1_lib8
+_kernel_sgecp_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_1_lib8
+ .def kernel_sgecp_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_1_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_1_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_1_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_1_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_1_lib8, .-kernel_sgecp_8_1_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_1_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_1_gen_lib8
+ .type kernel_sgecp_8_1_gen_lib8, @function
+kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_1_gen_lib8
+_kernel_sgecp_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_1_gen_lib8
+ .def kernel_sgecp_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_1_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_1_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_1_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_1_gen_lib8, .-kernel_sgecp_8_1_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_2_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_2_lib8
+ .type kernel_sgecp_8_2_lib8, @function
+kernel_sgecp_8_2_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_2_lib8
+_kernel_sgecp_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_2_lib8
+ .def kernel_sgecp_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_2_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_2_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_2_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_2_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_2_lib8, .-kernel_sgecp_8_2_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_2_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_2_gen_lib8
+ .type kernel_sgecp_8_2_gen_lib8, @function
+kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_2_gen_lib8
+_kernel_sgecp_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_2_gen_lib8
+ .def kernel_sgecp_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_2_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_2_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_2_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_2_gen_lib8, .-kernel_sgecp_8_2_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_3_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_3_lib8
+ .type kernel_sgecp_8_3_lib8, @function
+kernel_sgecp_8_3_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_3_lib8
+_kernel_sgecp_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_3_lib8
+ .def kernel_sgecp_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_3_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_3_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_3_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_3_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_3_lib8, .-kernel_sgecp_8_3_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_3_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_3_gen_lib8
+ .type kernel_sgecp_8_3_gen_lib8, @function
+kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_3_gen_lib8
+_kernel_sgecp_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_3_gen_lib8
+ .def kernel_sgecp_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_3_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_3_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_3_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_3_gen_lib8, .-kernel_sgecp_8_3_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_4_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_4_lib8
+ .type kernel_sgecp_8_4_lib8, @function
+kernel_sgecp_8_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_4_lib8
+_kernel_sgecp_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_4_lib8
+ .def kernel_sgecp_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_4_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_4_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_4_lib8, .-kernel_sgecp_8_4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_4_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_4_gen_lib8
+ .type kernel_sgecp_8_4_gen_lib8, @function
+kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_4_gen_lib8
+_kernel_sgecp_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_4_gen_lib8
+ .def kernel_sgecp_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_4_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_4_gen_lib8, .-kernel_sgecp_8_4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_5_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_5_lib8
+ .type kernel_sgecp_8_5_lib8, @function
+kernel_sgecp_8_5_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_5_lib8
+_kernel_sgecp_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_5_lib8
+ .def kernel_sgecp_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_5_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_5_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_5_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_5_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_5_lib8, .-kernel_sgecp_8_5_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_5_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_5_gen_lib8
+ .type kernel_sgecp_8_5_gen_lib8, @function
+kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_5_gen_lib8
+_kernel_sgecp_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_5_gen_lib8
+ .def kernel_sgecp_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_5_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_5_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_5_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_5_gen_lib8, .-kernel_sgecp_8_5_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_6_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_6_lib8
+ .type kernel_sgecp_8_6_lib8, @function
+kernel_sgecp_8_6_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_6_lib8
+_kernel_sgecp_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_6_lib8
+ .def kernel_sgecp_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_6_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_6_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_6_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_6_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_6_lib8, .-kernel_sgecp_8_6_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_6_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_6_gen_lib8
+ .type kernel_sgecp_8_6_gen_lib8, @function
+kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_6_gen_lib8
+_kernel_sgecp_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_6_gen_lib8
+ .def kernel_sgecp_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_6_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_6_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_6_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_6_gen_lib8, .-kernel_sgecp_8_6_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgecp_8_7_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_7_lib8
+ .type kernel_sgecp_8_7_lib8, @function
+kernel_sgecp_8_7_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_7_lib8
+_kernel_sgecp_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_7_lib8
+ .def kernel_sgecp_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_7_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_7_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_7_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_7_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_7_lib8, .-kernel_sgecp_8_7_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgecp_8_7_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgecp_8_7_gen_lib8
+ .type kernel_sgecp_8_7_gen_lib8, @function
+kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgecp_8_7_gen_lib8
+_kernel_sgecp_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgecp_8_7_gen_lib8
+ .def kernel_sgecp_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgecp_8_7_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgecp kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // 8*sda*sizeof(float)
+ sall $5, %r12d
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGECP_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgecp_8_7_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgecp_8_7_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgecp_8_7_gen_lib8, .-kernel_sgecp_8_7_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_16x4_lib8.S b/kernel/avx/kernel_sgemm_16x4_lib8.S
new file mode 100644
index 0000000..5c2d6c4
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_16x4_lib8.S
@@ -0,0 +1,7057 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
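+
+// note on the prologues above: the System V ABI (Linux/Mac) only requires
+// saving rbx, rbp and r12-r15, while the Windows x64 ABI additionally treats
+// rdi, rsi and xmm6-xmm15 as callee-saved and places stack arguments after
+// the return address and a 32-byte shadow space, hence the larger STACKSIZE
+// and the ARG5..ARG18 offsets starting at STACKSIZE+40.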
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
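+//
+// note: ymm0-ymm3 accumulate rows 0-7 and ymm4-ymm7 rows 8-15 of the 16x4
+// result tile. Each k iteration is a rank-1 update: vbroadcastf128 loads four
+// B values into both 128-bit lanes, vshufps with 0x00/0x55/0xaa/0xff
+// broadcasts one of them across the register, and vmulps + vaddps (this AVX
+// target has no FMA) accumulate it against the two A panels. The main loop
+// is unrolled by 4 in k.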
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_16x4_lib8, @function
+inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 8*sda*sizeof(float)
+
+ // preload
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+// 8 A0
+// 9 A1
+// 10 A0+
+// 11 A1+
+// 12 B
+// 13 B+
+// 14 Bt
+// 15 tmp
+
+ // unroll 0
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $4, %r10d
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r11), %ymm10 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r15), %ymm11 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r11), %ymm8 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r15), %ymm9 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ addq $128, %r13
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r11), %ymm10 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ addq $128, %r11
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r15), %ymm11 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ addq $128, %r15
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 0(%r11), %ymm8 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 0(%r15), %ymm9 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $4, %r10d
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r11), %ymm10 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r15), %ymm11 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r11), %ymm8 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r15), %ymm9 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ addq $128, %r13
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r11), %ymm10 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ addq $128, %r11
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r15), %ymm11 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ addq $128, %r15
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmulps %ymm10, %ymm14, %ymm15
+// vbroadcastf128 0(%r13), %ymm12 // B
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+// vmovaps 0(%r11), %ymm8 // A0
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+// vmovaps 0(%r15), %ymm9 // A1
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vaddps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+// vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r13
+ addq $32, %r15
+
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_16x4_lib8, .-inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
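+//
+// note: identical to the _add_ variant above except that vaddps is replaced
+// by vsubps, i.e. the 16x4 product is subtracted from the accumulators.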
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_16x4_lib8, @function
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+	addq	%r12, %r15 // A1 <- A0 + 8*sda*sizeof(float)
+
+ // preload
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+// 8 A0
+// 9 A1
+// 10 A0+
+// 11 A1+
+// 12 B
+// 13 B+
+// 14 Bt
+// 15 tmp
+
+ // unroll 0
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ subl $4, %r10d
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r11), %ymm10 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r15), %ymm11 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r11), %ymm8 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r15), %ymm9 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ addq $128, %r13
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r11), %ymm10 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ addq $128, %r11
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r15), %ymm11 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ addq $128, %r15
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 0(%r11), %ymm8 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 0(%r15), %ymm9 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ subl $4, %r10d
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r11), %ymm10 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 32(%r15), %ymm11 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmulps %ymm10, %ymm14, %ymm15
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r11), %ymm8 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vmovaps 64(%r15), %ymm9 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmulps %ymm8, %ymm14, %ymm15
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ addq $128, %r13
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r11), %ymm10 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ addq $128, %r11
+ vmulps %ymm8, %ymm14, %ymm15
+ vmovaps 96(%r15), %ymm11 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ addq $128, %r15
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmulps %ymm10, %ymm14, %ymm15
+// vbroadcastf128 0(%r13), %ymm12 // B
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vmulps %ymm10, %ymm14, %ymm15
+// vmovaps 0(%r11), %ymm8 // A0
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vmulps %ymm10, %ymm14, %ymm15
+// vmovaps 0(%r15), %ymm9 // A1
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm14, %ymm15
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+ vsubps %ymm15, %ymm6, %ymm6
+
+ vmulps %ymm10, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm14, %ymm15
+// vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vsubps %ymm15, %ymm7, %ymm7
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm14, %ymm15
+ vsubps %ymm15, %ymm4, %ymm4
+
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm14, %ymm15
+ vsubps %ymm15, %ymm5, %ymm5
+
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm14, %ymm15
+ vsubps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r13
+ addq $32, %r15
+
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vmulps %ymm8, %ymm14, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+ vmulps %ymm9, %ymm14, %ymm15
+ vsubps %ymm15, %ymm7, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_16x4_lib8, .-inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
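+//
+// illustrative C-style sketch of the computation (placeholder names, not the BLASFEO API):
+// ymm0..ymm3 hold rows 0..7 of columns 0..3, ymm4..ymm7 hold rows 8..15; within the
+// current 8-row panel of B, element (kk,jj) is broadcast from byte offset 4*(kk%8)+32*jj.
+//
+//	for(kk=0; kk<k; kk++)
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<16; ii++)
+//				acc[ii][jj] += A[ii][kk] * B[kk][jj];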
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_16x4_lib8, @function
+inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+ vmovaps 0(%r11, %r12, 1), %ymm14 // A
+
+ cmpl $8, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 1) // software prefetch
+ prefetcht0 64(%r13, %r14, 1) // software prefetch
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 32(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 64(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 96(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+ subl $8, %r10d
+
+ // unroll 1
+ vbroadcastss 4(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 100(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastss 8(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 104(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 3
+ vbroadcastss 12(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 128(%r11), %ymm13 // A
+ vbroadcastss 44(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 128(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 76(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 108(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 4
+ vbroadcastss 16(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 160(%r11), %ymm10 // A
+ vbroadcastss 48(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 160(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 80(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 112(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 5
+ vbroadcastss 20(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 192(%r11), %ymm13 // A
+ vbroadcastss 52(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 192(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 84(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 116(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 6
+ vbroadcastss 24(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 224(%r11), %ymm10 // A
+ vbroadcastss 56(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 224(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 88(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 120(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+ addq $256, %r11
+
+ // unroll 7
+ vbroadcastss 28(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss 60(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 92(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 124(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+ addq %r14, %r13
+
+ cmpl $8, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $7, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 32(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 64(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 96(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+ subl $8, %r10d
+
+ // unroll 1
+ vbroadcastss 4(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 100(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 2
+ vbroadcastss 8(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 96(%r11), %ymm10 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 104(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 3
+ vbroadcastss 12(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 128(%r11), %ymm13 // A
+ vbroadcastss 44(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 128(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 76(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 108(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 4
+ vbroadcastss 16(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 160(%r11), %ymm10 // A
+ vbroadcastss 48(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 160(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 80(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 112(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 5
+ vbroadcastss 20(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 192(%r11), %ymm13 // A
+ vbroadcastss 52(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 192(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 84(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 116(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ // unroll 6
+ vbroadcastss 24(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovapd 224(%r11), %ymm10 // A
+ vbroadcastss 56(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovapd 224(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 88(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 120(%r13), %ymm12 // B
+ vmulps %ymm13, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+ addq $256, %r11
+
+ // unroll 7
+ vbroadcastss 28(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss 60(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 92(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 124(%r13), %ymm12 // B
+ vmulps %ymm10, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm11, %ymm12, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+ addq %r14, %r13
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vbroadcastss 0(%r13), %ymm14 // B[0]
+ vmulps %ymm12, %ymm14, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm13, %ymm14, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 32(%r13), %ymm14 // B[1]
+ vmulps %ymm12, %ymm14, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm13, %ymm14, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 64(%r13), %ymm14 // B[2]
+ vmulps %ymm12, %ymm14, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm13, %ymm14, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 96(%r13), %ymm14 // B[3]
+ vmulps %ymm12, %ymm14, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vmulps %ymm13, %ymm14, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_16x4_lib8, .-inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
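+//
+// illustrative sketch (placeholder names, not the BLASFEO API): the edge consumes
+// kend = min(k, 8-offB) columns of A / rows of B one at a time, starting at
+// B+offB*sizeof(float), so that on exit B is aligned to the start of its next 8-row panel:
+//
+//	kend = 8-offB<k ? 8-offB : k;
+//	for(kk=0; kk<kend; kk++)
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<16; ii++)
+//				acc[ii][jj] += A[ii][kk] * B[offB+kk][jj];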
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_16x4_lib8, @function
+inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r15d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %ebx
+ subl %r15d, %ebx // 8-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,8-offsetB)
+
+ movl %r15d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r13 // B+offsetB*sizeof(float)
+
+1:
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vbroadcastss 0(%r13), %ymm15 // B[0]
+ vmulps %ymm12, %ymm15, %ymm14
+ vaddps %ymm14, %ymm0, %ymm0
+ vmulps %ymm13, %ymm15, %ymm14
+ vaddps %ymm14, %ymm4, %ymm4
+ vbroadcastss 32(%r13), %ymm15 // B[1]
+ vmulps %ymm12, %ymm15, %ymm14
+ vaddps %ymm14, %ymm1, %ymm1
+ vmulps %ymm13, %ymm15, %ymm14
+ vaddps %ymm14, %ymm5, %ymm5
+ vbroadcastss 64(%r13), %ymm15 // B[2]
+ vmulps %ymm12, %ymm15, %ymm14
+ vaddps %ymm14, %ymm2, %ymm2
+ vmulps %ymm13, %ymm15, %ymm14
+ vaddps %ymm14, %ymm6, %ymm6
+ vbroadcastss 96(%r13), %ymm15 // B[3]
+ vmulps %ymm12, %ymm15, %ymm14
+ vaddps %ymm14, %ymm3, %ymm3
+ vmulps %ymm13, %ymm15, %ymm14
+ vaddps %ymm14, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // end-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r13 // B+1*sizeof(float)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r14, %r13
+ subq $32, %r13 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_16x4_lib8, .-inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
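+//
+// illustrative sketch (placeholder names, not the BLASFEO API): B is lower triangular,
+// so row kk of B has nonzeros only in columns jj<=kk and the first edge iterations
+// update a growing number of accumulator columns (the rest is left to the full kernel):
+//
+//	for(kk=0; kk<3 && kk<k; kk++)
+//		for(jj=0; jj<=kk; jj++)
+//			for(ii=0; ii<16; ii++)
+//				acc[ii][jj] += A[ii][kk] * B[kk][jj];
+//
+// the offB==5/6/7 branches below additionally wrap B to its next 8-row panel mid-edge.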
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trmm_nn_rl_16x4_lib8, @function
+inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ movl %r15d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ movq %r13, %rbx // B
+ addq %rax, %rbx // B+offsetB*sizeof(float)
+
+
+ cmpl $4, %r15d
+ jg 1f
+
+ // offB==0, 1, 2, 3, 4
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+ cmpl $5, %r15d
+ jg 1f
+
+ // offB==5
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movl $0, %r15d // offsetB=0
+
+ jmp 0f // end
+
+
+1:
+ cmpl $6, %r15d
+ jg 1f
+
+ // offB==6
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movq %r13, %rbx // B
+ movl $0, %r15d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 64(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+// cmpl $7, %r15d
+// jg 0f
+
+ // offB==7
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movq %r13, %rbx // B
+ movl $0, %r15d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 68(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmulps %ymm9, %ymm12, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+// jmp 0f // end
+
+
+ // end
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trmm_nn_rl_16x4_lib8, .-inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
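+//
+// illustrative sketch (placeholder names, not the BLASFEO API): right-solve of the 16x4
+// block against the transposed lower-triangular 4x4 factor stored at r10 (one column
+// every 32 bytes), using the precomputed reciprocal diagonal at r11:
+//
+//	for(jj=0; jj<4; jj++)
+//		{
+//		for(ii=0; ii<16; ii++)
+//			acc[ii][jj] *= inv_diag_D[jj];
+//		if(kn<jj+2)
+//			break;
+//		for(ll=jj+1; ll<4; ll++)
+//			for(ii=0; ii<16; ii++)
+//				acc[ii][ll] -= acc[ii][jj] * D[ll+jj*8];
+//		}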
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_16x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_16x4_vs_lib8, .-inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
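+//
+// illustrative sketch (placeholder names, not the BLASFEO API): Cholesky factorization
+// of the 4x4 diagonal block sitting in rows 0..3, storing the reciprocal pivots:
+//
+//	for(jj=0; jj<4; jj++)
+//		{
+//		tmp = acc[jj][jj];
+//		tmp = tmp>0.0f ? 1.0f/sqrtf(tmp) : 0.0f;
+//		inv_diag_E[jj] = tmp;
+//		for(ii=0; ii<16; ii++)
+//			acc[ii][jj] *= tmp;
+//		if(kn<jj+2)
+//			break;
+//		for(ll=jj+1; ll<4; ll++)
+//			for(ii=0; ii<16; ii++)
+//				acc[ii][ll] -= acc[ii][jj] * acc[ll][jj];
+//		}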
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_16x4_vs_lib8, @function
+inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_16x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+ vmovss %xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_16x4_vs_lib8, .-inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_12x4_vs_lib8, @function
+inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_12x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vextractf128 $0x1, %ymm0, %xmm13
+// vpermilps $0x00, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm1, %xmm13
+ vpermilps $0x55, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vpermilps $0xaa, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilps $0xff, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+ vmovss %xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_12x4_vs_lib8, .-inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
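+//
+// illustrative sketch (placeholder names, not the BLASFEO API): acc <- alpha*acc + beta*C;
+// the load of C (rows 0..7 from the panel at r12, rows 8..15 from the panel r13 bytes
+// further on) is skipped entirely when beta==0.0:
+//
+//	for(jj=0; jj<4; jj++)
+//		for(ii=0; ii<16; ii++)
+//			acc[ii][jj] = beta[0]!=0.0f ? alpha[0]*acc[ii][jj] + beta[0]*C[ii][jj]
+//			                            : alpha[0]*acc[ii][jj];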
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_16x4_lib8, @function
+inner_scale_ab_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ movq %r12, %r15 // C1 <- C0
+ addq %r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_16x4_lib8, .-inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_16x4_gen_lib8, @function
+inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ movq %r13, %rax // C1 <- C0
+ addq %r14, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ vmovaps 0(%rax), %ymm14
+ vmulps %ymm14, %ymm15, %ymm14
+ vaddps %ymm4, %ymm14, %ymm4
+ vmovaps 32(%rax), %ymm14
+ vmulps %ymm14, %ymm15, %ymm14
+ vaddps %ymm5, %ymm14, %ymm5
+ vmovaps 64(%rax), %ymm14
+ vmulps %ymm14, %ymm15, %ymm14
+ vaddps %ymm6, %ymm14, %ymm6
+ vmovaps 96(%rax), %ymm14
+ vmulps %ymm14, %ymm15, %ymm14
+ vaddps %ymm7, %ymm14, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %rax, %rbx // C1
+ addq %r14, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_16x4_gen_lib8, .-inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_16x4_lib8, @function
+inner_scale_a0_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_16x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_16x4_lib8, .-inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_16x4_lib8, @function
+inner_scale_11_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_lib8:
+#endif
+#endif
+
+ movq %r10, %r15 // C1 <- C0
+ addq %r11, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_16x4_lib8, .-inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_16x4_gen_lib8, @function
+inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_gen_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // C1 <- C0
+ addq %r12, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ vmovaps 0(%rax), %ymm14
+ vaddps %ymm4, %ymm14, %ymm4
+ vmovaps 32(%rax), %ymm14
+ vaddps %ymm5, %ymm14, %ymm5
+ vmovaps 64(%rax), %ymm14
+ vaddps %ymm6, %ymm14, %ymm6
+ vmovaps 96(%rax), %ymm14
+ vaddps %ymm7, %ymm14, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %rax, %rbx // C1
+ addq %r12, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_16x4_gen_lib8, .-inner_scale_11_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
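+//
+// illustrative sketch (placeholder names, not the BLASFEO API): plain panel-major store
+// of the 16x4 result, 8 rows (one 32-byte column) per panel at a time:
+//
+//	// D0 = D, D1 = D + r11 bytes
+//	for(jj=0; jj<4; jj++)
+//		{
+//		for(ii=0; ii<8; ii++) D0[ii+jj*8] = acc[ii][jj];
+//		for(ii=0; ii<8; ii++) D1[ii+jj*8] = acc[8+ii][jj];
+//		}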
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_lib8, @function
+inner_store_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_lib8:
+#endif
+#endif
+
+ movq %r10, %r15 // D1 <- D0
+ addq %r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r15)
+ vmovaps %ymm5, 32(%r15)
+ vmovaps %ymm6, 64(%r15)
+ vmovaps %ymm7, 96(%r15)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_lib8, .-inner_store_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
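+//
+// illustrative sketch (placeholder names; assumes .LC01 holds {8.5, ..., 15.5} as in the
+// other lib8 kernels): rows 0..7 are always stored in full, rows 8..15 go through
+// vmaskmovps with a per-lane mask built from km, and columns beyond kn are skipped:
+//
+//	for(ii=0; ii<8; ii++)
+//		mask[ii] = ( 8.5f+(float)ii - (float)km < 0.0f ); // store row 8+ii only if 8+ii < km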
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_vs_lib8, @function
+inner_store_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+ jmp 0f
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_vs_lib8, .-inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_gen_lib8, @function
+inner_store_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute D1
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(float)
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ cmpl $2, %r15d
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%rbx)
+ jl 7f // end
+ cmpl $3, %r15d
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%rbx)
+ jl 7f // end
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%rbx)
+ je 7f // end
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%rbx)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbp // D1
+ addq %r12, %rbp // D2 <- D1 + 4*sdd*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_gen_lib8, .-inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n, lower triangular
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
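+//
+// illustrative sketch (placeholder names, not the BLASFEO API): lower-triangular store;
+// the vblendps masks 0x01/0x03/0x07 keep the strictly-upper entries of columns 1..3
+// already present in D, while the lower panel (rows 8..15) is stored in full:
+//
+//	for(jj=0; jj<4; jj++)
+//		for(ii=0; ii<16; ii++)
+//			if(ii>=jj)
+//				D[ii][jj] = acc[ii][jj];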
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_lib8, @function
+inner_store_l_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_lib8:
+#endif
+#endif
+
+ vmovaps 32(%r10), %ymm12
+ vmovaps 64(%r10), %ymm13
+ vmovaps 96(%r10), %ymm14
+
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vblendps $0x03, %ymm13, %ymm2, %ymm2
+ vblendps $0x07, %ymm14, %ymm3, %ymm3
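+	// the blends keep the strictly upper triangle of the leading 4x4 block as
+	// already present in D, so only the lower triangle and the rows below it
+	// are overwritten; this is the store used by the ssyrk/spotrf kernels below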
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_lib8, .-inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_vs_lib8, @function
+inner_store_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_vs_lib8, .-inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_gen_lib8, @function
+inner_store_l_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+	vblendps	$0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+	vblendps	$0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%r11, %r12, 1)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_gen_lib8, .-inner_store_l_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_lib8, @function
+inner_store_l_12x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_lib8:
+#endif
+#endif
+
+ vmovaps 0(%r10), %ymm12
+ vmovaps 32(%r10), %ymm13
+ vmovaps 64(%r10), %ymm14
+ vmovaps 96(%r10), %ymm15
+
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vblendps $0x1f, %ymm13, %ymm1, %ymm1
+ vblendps $0x3f, %ymm14, %ymm2, %ymm2
+ vblendps $0x7f, %ymm15, %ymm3, %ymm3
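+	// 12x4 variant: the 4x4 diagonal block sits in rows 4-7 of the first panel,
+	// so each column also keeps the first 4 rows of D unchanged in addition to
+	// its strictly upper triangle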
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_lib8, .-inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_vs_lib8, @function
+inner_store_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x1f, %ymm12, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x3f, %ymm12, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x7f, %ymm12, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_vs_lib8, .-inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_gen_lib8, @function
+inner_store_l_12x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmovaps 0(%r11), %ymm12
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x1f, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x3f, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x7f, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%r11, %r12, 1)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_gen_lib8, .-inner_store_l_12x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_sgemm_nt_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
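+// following the blasfeo lib8 naming, this computes D <- alpha*A*B^T + beta*C on a
+// 16x4 block, with A, C, D stored in 8-row panels (panel strides sda, sdc, sdd) and
+// B read in transposed (nt) form; as a plain-C sketch of the intended result,
+// assuming the usual lib8 panel-major indexing:
+//
+//	for(ii=0; ii<16; ii++)
+//		for(jj=0; jj<4; jj++)
+//			{
+//			tmp = 0.0;
+//			for(kk=0; kk<k; kk++)
+//				tmp += A[ii%8 + 8*kk + (ii/8)*8*sda] * B[jj + 8*kk];
+//			D[ii%8 + 8*jj + (ii/8)*8*sdd] = alpha[0]*tmp + beta[0]*C[ii%8 + 8*jj + (ii/8)*8*sdc];
+//			}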
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_lib8
+ .type kernel_sgemm_nt_16x4_lib8, @function
+kernel_sgemm_nt_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_lib8
+_kernel_sgemm_nt_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_lib8, .-kernel_sgemm_nt_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_nt_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
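+// same operation as kernel_sgemm_nt_16x4_lib8, with km/kn bounding the number of
+// rows and columns of D that are actually written (variable-size store)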
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_vs_lib8
+ .type kernel_sgemm_nt_16x4_vs_lib8, @function
+kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_vs_lib8
+_kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_16x4_vs_lib8
+ .def kernel_sgemm_nt_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_vs_lib8, .-kernel_sgemm_nt_16x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_sgemm_nt_16x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
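+// generalized variant: offsetC/offsetD give the row offset of C/D inside their
+// 8-row panels, and only rows [m0, m1) and columns [n0, n1) of the 16x4 block are
+// written; note that non-zero offsetD is still marked TODO in inner_store_16x4_gen_lib8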
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_gen_lib8
+ .type kernel_sgemm_nt_16x4_gen_lib8, @function
+kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_gen_lib8
+_kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_16x4_gen_lib8
+ .def kernel_sgemm_nt_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_gen_lib8, .-kernel_sgemm_nt_16x4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_sgemm_nn_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
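+// nn variant: B is not transposed, so it is swept along its panels with panel
+// stride sdb starting at row offsetB (the unaligned leading part is handled by
+// inner_edge_gemm_add_nn_16x4_lib8); assuming the usual lib8 panel-major layout,
+// B(kk,jj) sits at B[((offsetB+kk)/8)*8*sdb + 8*jj + (offsetB+kk)%8]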
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_lib8
+ .type kernel_sgemm_nn_16x4_lib8, @function
+kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_lib8
+_kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_lib8
+ .def kernel_sgemm_nn_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_lib8, .-kernel_sgemm_nn_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_sgemm_nn_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_vs_lib8
+ .type kernel_sgemm_nn_16x4_vs_lib8, @function
+kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_vs_lib8
+_kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_vs_lib8
+ .def kernel_sgemm_nn_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG13, %r12 // km
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_vs_lib8, .-kernel_sgemm_nn_16x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88 rsp+96
+// void kernel_sgemm_nn_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_gen_lib8
+ .type kernel_sgemm_nn_16x4_gen_lib8, @function
+kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_gen_lib8
+_kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_gen_lib8
+ .def kernel_sgemm_nn_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // offsetC
+ movq ARG10, %r13 // C
+ movq ARG11, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG12, %r10 // offsetD
+ movq ARG13, %r11 // D
+ movq ARG14, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG15, %r13 // m0
+ movq ARG16, %r14 // m1
+ movq ARG17, %r15 // n0
+ movq ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_gen_lib8, .-kernel_sgemm_nn_16x4_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_ssyrk_nt_l_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
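+// syrk-style kernel: same nt product as kernel_sgemm_nt_16x4_lib8, but stored with
+// the lower (_l) store so the strictly upper part of the leading 4x4 diagonal block
+// of D is left untouched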
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_16x4_lib8
+ .type kernel_ssyrk_nt_l_16x4_lib8, @function
+kernel_ssyrk_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_16x4_lib8
+_kernel_ssyrk_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_16x4_lib8, .-kernel_ssyrk_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_ssyrk_nt_l_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_16x4_vs_lib8
+ .type kernel_ssyrk_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_16x4_vs_lib8
+_kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_16x4_vs_lib8
+ .def kernel_ssyrk_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_16x4_vs_lib8, .-kernel_ssyrk_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_ssyrk_nt_l_12x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_12x4_lib8
+ .type kernel_ssyrk_nt_l_12x4_lib8, @function
+kernel_ssyrk_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_12x4_lib8
+_kernel_ssyrk_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_12x4_lib8, .-kernel_ssyrk_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_ssyrk_nt_l_12x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_12x4_vs_lib8
+ .type kernel_ssyrk_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_12x4_vs_lib8
+_kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_12x4_vs_lib8
+ .def kernel_ssyrk_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_12x4_vs_lib8, .-kernel_ssyrk_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_strsm_nt_rl_inv_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
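+// triangular solve kernel: in the blasfeo naming, rl_inv means the 4x4 factor E is
+// lower triangular, applied from the right in transposed form with its diagonal
+// pre-inverted in inv_diag_E, i.e. D <- (C - A*B^T) * E^-T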
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_16x4_lib8
+ .type kernel_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_16x4_lib8
+_kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_16x4_lib8
+ .def kernel_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movl $4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_16x4_lib8, .-kernel_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_strsm_nt_rl_inv_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+ .type kernel_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+ .def kernel_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // m1
+ movq ARG12, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
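+// fused kernel: accumulates Ap*Bp^T, subtracts Am*Bm^T, adds C, then applies the
+// same right-lower triangular solve with E / inv_diag_E before storing into D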
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movl $4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movq ARG16, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG15, %r12 // km
+ movq ARG16, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_spotrf_nt_l_12x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
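+// Cholesky kernel: computes C - A*B^T, factorizes the leading 4x4 block of the
+// 12x4 result and scales the rows below it, storing with the lower 12x4 store;
+// per the blasfeo convention, inv_diag_D receives the reciprocals of the factor's diagonal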
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_12x4_lib8
+ .type kernel_spotrf_nt_l_12x4_lib8, @function
+kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_12x4_lib8
+_kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_12x4_lib8
+ .def kernel_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_12x4_lib8, .-kernel_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_spotrf_nt_l_12x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_12x4_vs_lib8
+ .type kernel_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_12x4_vs_lib8
+_kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_12x4_vs_lib8
+ .def kernel_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // m1
+ movq ARG11, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_12x4_vs_lib8, .-kernel_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_spotrf_nt_l_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_16x4_lib8
+ .type kernel_spotrf_nt_l_16x4_lib8, @function
+kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_16x4_lib8
+_kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_16x4_lib8
+ .def kernel_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_16x4_lib8, .-kernel_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_spotrf_nt_l_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_16x4_vs_lib8
+ .type kernel_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_16x4_vs_lib8
+_kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_16x4_vs_lib8
+ .def kernel_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // m1
+ movq ARG11, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_16x4_vs_lib8, .-kernel_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_ssyrk_spotrf_nt_l_12x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
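+// fused syrk + Cholesky kernel: adds Ap*Bp^T, subtracts Am*Bm^T, adds C, then
+// factorizes as in kernel_spotrf_nt_l_12x4_lib8 above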
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_12x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_12x4_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_12x4_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
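+// _vs ("variable size") variant: the trailing km/kn arguments give the number of rows and
+// columns actually stored, so the kernel can also be used on edge blocks smaller than 12x4
+// (see the vs potrf edge and vs store calls below).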
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_ssyrk_spotrf_nt_l_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_16x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_16x4_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_16x4_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strmm_nn_rl_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
+
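+// Computes D = alpha * A * B with B lower triangular on the right and neither operand
+// transposed: the triangular edge below consumes the leading columns of B, then the plain
+// nn gemm loop finishes the product.
+// Illustrative call from C, following the prototype above (sketch only; A, B and D are
+// assumed to be stored in 8-wide panel-major format, with sda/sdb/sdd as in the prototype):
+//   kernel_strmm_nn_rl_16x4_lib8(k, &alpha, A, sda, offsetB, B, sdb, D, sdd);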
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_lib8
+ .type kernel_strmm_nn_rl_16x4_lib8, @function
+kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_lib8
+_kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_lib8
+ .def kernel_strmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_lib8, .-kernel_strmm_nn_rl_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_strmm_nn_rl_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_vs_lib8
+ .type kernel_strmm_nn_rl_16x4_vs_lib8, @function
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_vs_lib8
+_kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_vs_lib8
+ .def kernel_strmm_nn_rl_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_vs_lib8, .-kernel_strmm_nn_rl_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_strmm_nn_rl_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
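+// _gen variant: stores an arbitrary sub-block [m0,m1) x [n0,n1) of the 16x4 result at column
+// offset offsetD, presumably masking partial rows and columns with the index constants in the
+// read-only data section at the end of this file.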
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_gen_lib8
+ .type kernel_strmm_nn_rl_16x4_gen_lib8, @function
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_gen_lib8
+_kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_gen_lib8
+ .def kernel_strmm_nn_rl_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // offsetD
+ movq ARG9, %r11 // D
+ movq ARG10, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG11, %r13 // m0
+ movq ARG12, %r14 // m1
+ movq ARG13, %r15 // n0
+ movq ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_gen_lib8, .-kernel_strmm_nn_rl_16x4_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
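+// Note: the .long values above are IEEE-754 single-precision bit patterns
+// (1056964608 = 0.5f, 1069547520 = 1.5f, ..., 1065353216 = 1.0f, 3212836864 = -1.0f).
+// LC00/LC01/LC02 hold the lane indices 0.5 .. 23.5, presumably used to build masks for the
+// variable-size and generic store routines; LC03 is a +/-1.0 sign pattern.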
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_8x4_lib8.S b/kernel/avx/kernel_sgemm_8x4_lib8.S
new file mode 100644
index 0000000..d319a83
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_8x4_lib8.S
@@ -0,0 +1,6673 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
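+// The ARG*/PROLOGUE/EPILOGUE macros above abstract the calling convention: on the System V
+// ABI (Linux/Mac) the first six integer/pointer arguments arrive in rdi, rsi, rdx, rcx, r8, r9
+// and the rest on the stack, while on Win64 the first four arrive in rcx, rdx, r8, r9 above
+// 32 bytes of shadow space, and rdi, rsi and xmm6-xmm15 are callee-saved, hence the larger
+// register save area in the Windows prologue.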
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
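+// Note on the scheme used below (inferred from the code): B is loaded four floats at a time
+// with vbroadcastf128 and rotated with vshufps between the four rank-1 updates, so ymm0-ymm3
+// accumulate the 8x4 product in a permuted column order that the blend/scale routines
+// unscramble before storing.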
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_8x4_lib8, @function
+inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 128(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 32(%r11), %ymm13 // A
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm3, %ymm3
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+// vbroadcastf128 128(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+// vbroadcastf128 32(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm3, %ymm3
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm2, %ymm2
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm3, %ymm3
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_8x4_lib8, .-inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_8x4_lib8, @function
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 128(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 32(%r11), %ymm13 // A
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm3, %ymm3
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+// vbroadcastf128 128(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm15, %ymm11
+// vbroadcastf128 32(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm3, %ymm3
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm2, %ymm2
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm3, %ymm3
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_8x4_lib8, .-inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_8x4_lib8, @function
+inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r12, %r14 // B_next <- B
+	addq %r13, %r14 // B_next <- B + 8*sdb*sizeof(float)
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+ prefetcht0 64(%r14) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 7
+ vmovaps 224(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ subl $8, %r10d
+ addq $256, %r11
+
+ mov %r14, %r12
+ addq %r13, %r14
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_8x4_lib8, .-inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nn_8x4_lib8, @function
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r12, %r14 // B_next <- B
+	addq %r13, %r14 // B_next <- B + 8*sdb*sizeof(float)
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+ prefetcht0 64(%r14) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+
+ // unroll 7
+ vmovaps 224(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+ subl $8, %r10d
+ addq $256, %r11
+
+ mov %r14, %r12
+ addq %r13, %r14
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vsubps %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nn_8x4_lib8, .-inner_kernel_gemm_sub_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13   <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(8-offB)
+// r11   <- A+(8-offB)*bs*sizeof(float)
+// r12   <- B-offB+bs*sdb*sizeof(float)
+// r13   <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
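+// This edge handles a B panel starting at a non-zero column offset (offB): it performs
+// min(k, 8-offB) single-column updates to reach the panel boundary, then advances the B
+// pointer to the next row of panels before the aligned main kernel takes over.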
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_8x4_lib8, @function
+inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %r15d
+ subl %r14d, %r15d // 8-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,8-offsetB)
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r12 // B+offsetB*sizeof(float)
+
+1:
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r12 // B+1*sizeof(float)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_8x4_lib8, .-inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13   <- bs*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10   <- k-(4-offB)
+// r11   <- A+(4-offB)*bs*sizeof(float)
+// r12   <- B-offB+bs*sdb*sizeof(float)
+// r13   <- bs*sdb*sizeof(float)
+// r14   <- offB
+// ymm0  <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1  <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2  <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3  <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
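+// Triangular edge for B lower triangular on the right (summary of the code below): row 0 of
+// the triangle touches only column 0, row 1 columns 0-1, row 2 columns 0-2, so each branch
+// accumulates a growing number of columns, and the offB==5/6/7 branches additionally handle
+// the triangle straddling an 8-row panel boundary.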
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trmm_nn_rl_8x4_lib8, @function
+inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ movq %r12, %rbx // B
+ addq %rax, %rbx // B+offsetB*sizeof(float)
+
+
+ cmpl $4, %r14d
+ jg 1f
+
+ // offB==0, 1, 2, 3, 4
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+ cmpl $5, %r14d
+ jg 1f
+
+ // offB==5
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movl $0, %r14d // offsetB=0
+
+ jmp 0f // end
+
+
+1:
+ cmpl $6, %r14d
+ jg 1f
+
+ // offB==6
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movq %r12, %rbx // B
+ movl $0, %r14d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+// cmpl $7, %r14d
+// jg 0f
+
+	// offB==7
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movq %r12, %rbx // B
+ movl $0, %r14d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 68(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+// jmp 0f // end
+
+
+ // end
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trmm_nn_rl_8x4_lib8, .-inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10  <- E
+// r11  <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x4_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#endif
+#endif
+
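+	// forward substitution for D * E^T = C with E lower triangular:
+	// for each column j, scale by inv_diag_E[j] (from r11), then subtract
+	// e_ij * d_j from the remaining columns i > j, with e_ij read from
+	// the lib8 panel of E (r10)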
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x4_lib8, .-inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x4_vs_lib8, .-inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x4_lib8, @function
+inner_edge_potrf_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
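+	// factorize the 8x4 panel column by column: for column j, take the
+	// diagonal element d_jj, compute 1.0/sqrt(d_jj) (0.0 if d_jj is not
+	// positive), store it to inv_diag_E (r10), scale the whole column by
+	// it and subtract its rank-1 contribution from the columns to the right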
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss %xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x4_lib8, .-inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization vs
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x4_vs_lib8, @function
+inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss %xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x4_vs_lib8, .-inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_lib8, @function
+inner_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_lib8, .-inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_lib8, @function
+inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm4
+ vmulps %ymm1, %ymm15, %ymm5
+ vmulps %ymm2, %ymm15, %ymm6
+ vmulps %ymm3, %ymm15, %ymm7
+
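+	// the accumulators come from the 8x4 nt kernel with A and B swapped,
+	// i.e. they hold the transposed 8x4 result in the shuffled lane order
+	// documented for the blend routines; the blends below undo that
+	// shuffle and the unpacks transpose the block into eight 4-element
+	// columns (low halves in xmm0-3, high halves extracted to xmm4-7)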
+ // transpose
+ vblendps $0xaa, %ymm5, %ymm4, %ymm0
+	vblendps $0xaa, %ymm4, %ymm5, %ymm1
+ vblendps $0xaa, %ymm6, %ymm7, %ymm2
+ vblendps $0xaa, %ymm7, %ymm6, %ymm3
+
+ vunpcklps %ymm1, %ymm0, %ymm4
+ vunpckhps %ymm1, %ymm0, %ymm5
+ vunpcklps %ymm3, %ymm2, %ymm6
+ vunpckhps %ymm3, %ymm2, %ymm7
+
+ vunpcklpd %ymm5, %ymm7, %ymm2
+ vunpckhpd %ymm5, %ymm7, %ymm3
+ vunpcklpd %ymm6, %ymm4, %ymm0
+ vunpckhpd %ymm6, %ymm4, %ymm1
+
+ vextractf128 $0x1, %ymm0, %xmm4
+ vextractf128 $0x1, %ymm1, %xmm5
+ vextractf128 $0x1, %ymm2, %xmm6
+ vextractf128 $0x1, %ymm3, %xmm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm0, %xmm0
+ vmovaps 32(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm1, %xmm1
+ vmovaps 64(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm2, %xmm2
+ vmovaps 96(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm3, %xmm3
+ vmovaps 128(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm4, %xmm4
+ vmovaps 160(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm5, %xmm5
+ vmovaps 192(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm6, %xmm6
+ vmovaps 224(%r12), %xmm15
+ vmulps %xmm15, %xmm14, %xmm15
+ vaddps %xmm15, %xmm7, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_lib8, .-inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_gen_lib8, @function
+inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_gen_lib8, .-inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_gen_lib8, @function
+inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm4
+ vmulps %ymm1, %ymm15, %ymm5
+ vmulps %ymm2, %ymm15, %ymm6
+ vmulps %ymm3, %ymm15, %ymm7
+
+ // transpose
+ vblendps $0xaa, %ymm5, %ymm4, %ymm0
+	vblendps $0xaa, %ymm4, %ymm5, %ymm1
+ vblendps $0xaa, %ymm6, %ymm7, %ymm2
+ vblendps $0xaa, %ymm7, %ymm6, %ymm3
+
+ vunpcklps %ymm1, %ymm0, %ymm4
+ vunpckhps %ymm1, %ymm0, %ymm5
+ vunpcklps %ymm3, %ymm2, %ymm6
+ vunpckhps %ymm3, %ymm2, %ymm7
+
+ vunpcklpd %ymm5, %ymm7, %ymm2
+ vunpckhpd %ymm5, %ymm7, %ymm3
+ vunpcklpd %ymm6, %ymm4, %ymm0
+ vunpckhpd %ymm6, %ymm4, %ymm1
+
+ vextractf128 $0x1, %ymm0, %xmm4
+ vextractf128 $0x1, %ymm1, %xmm5
+ vextractf128 $0x1, %ymm2, %xmm6
+ vextractf128 $0x1, %ymm3, %xmm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+	// C is addressed through r13 (r12 holds the offset); beta is in ymm15
+	vmovaps 0(%r13), %xmm12
+	vmulps %xmm12, %xmm15, %xmm12
+	vaddps %xmm12, %xmm0, %xmm0
+	vmovaps 32(%r13), %xmm12
+	vmulps %xmm12, %xmm15, %xmm12
+	vaddps %xmm12, %xmm1, %xmm1
+	vmovaps 64(%r13), %xmm12
+	vmulps %xmm12, %xmm15, %xmm12
+	vaddps %xmm12, %xmm2, %xmm2
+	vmovaps 96(%r13), %xmm12
+	vmulps %xmm12, %xmm15, %xmm12
+	vaddps %xmm12, %xmm3, %xmm3
+	vmovaps 128(%r13), %xmm12
+	vmulps %xmm12, %xmm15, %xmm12
+	vaddps %xmm12, %xmm4, %xmm4
+	vmovaps 160(%r13), %xmm12
+	vmulps %xmm12, %xmm15, %xmm12
+	vaddps %xmm12, %xmm5, %xmm5
+	vmovaps 192(%r13), %xmm12
+	vmulps %xmm12, %xmm15, %xmm12
+	vaddps %xmm12, %xmm6, %xmm6
+	vmovaps 224(%r13), %xmm12
+	vmulps %xmm12, %xmm15, %xmm12
+	vaddps %xmm12, %xmm7, %xmm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_gen_lib8, .-inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_8x4_lib8, @function
+inner_scale_a0_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_8x4_lib8, .-inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_lib8, @function
+inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib8:
+#endif
+#endif
+
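+	// undo the lane shuffle left by the nt kernel: two blend passes turn
+	// [d00 d11 d22 d33 ..], [d01 d10 d23 d32 ..], [d03 d12 d21 d30 ..],
+	// [d02 d13 d20 d31 ..] into the plain columns [d0j d1j d2j d3j ..]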
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_lib8, .-inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_gen_lib8, @function
+inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_gen_lib8, .-inner_blend_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_lib8, @function
+inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_lib8, .-inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_gen_lib8, @function
+inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %r15 // C0
+ addq %r12, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_gen_lib8, .-inner_blend_scale_11_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_lib8, @function
+inner_store_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_lib8:
+#endif
+#endif
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_lib8, .-inner_store_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_lib8, @function
+inner_store_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_lib8:
+#endif
+#endif
+
+ vmovaps %xmm0, 0(%r10)
+ vmovaps %xmm1, 32(%r10)
+ vmovaps %xmm2, 64(%r10)
+ vmovaps %xmm3, 96(%r10)
+ vmovaps %xmm4, 128(%r10)
+ vmovaps %xmm5, 160(%r10)
+ vmovaps %xmm6, 192(%r10)
+ vmovaps %xmm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_lib8, .-inner_store_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_vs_lib8, @function
+inner_store_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm12, %ymm14
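+	// ymm14 now has the sign bit set in the lanes whose constant from
+	// LC00 is smaller than km, so the masked stores below only write the
+	// first km rows of each column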
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm14, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmaskmovps %ymm1, %ymm14, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmaskmovps %ymm2, %ymm14, 64(%r10)
+ je 0f // end
+ vmaskmovps %ymm3, %ymm14, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_vs_lib8, .-inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_vs_lib8, @function
+inner_store_4x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %xmm14, %xmm12, %xmm14
+
+ // offset==0
+ vmaskmovps %xmm0, %xmm14, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmaskmovps %xmm1, %xmm14, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmaskmovps %xmm2, %xmm14, 64(%r10)
+ cmpl $4, %r12d
+ jl 0f // end
+ vmaskmovps %xmm3, %xmm14, 96(%r10)
+ cmpl $5, %r12d
+ jl 0f // end
+ vmaskmovps %xmm4, %xmm14, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovps %xmm5, %xmm14, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovps %xmm6, %xmm14, 192(%r10)
+ je 0f // end
+ vmaskmovps %xmm7, %xmm14, 224(%r10)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_vs_lib8, .-inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_gen_lib8, @function
+inner_store_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
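+	// ymm15 combines both masks: the sign bit is set only in the lanes
+	// between m0 and m1, so the masked stores below only touch rows in
+	// [m0, m1)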
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ je 7f // end
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_gen_lib8, .-inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_gen_lib8, @function
+inner_store_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %xmm12, %xmm14, %xmm14
+ vsubps %xmm15, %xmm12, %xmm15
+ vandps %xmm14, %xmm15, %xmm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ vmovaps %xmm6, %xmm5
+ vmovaps %xmm7, %xmm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ vmovaps %xmm6, %xmm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ addq $32, %r11
+
+ cmpl $3, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ addq $32, %r11
+
+ cmpl $4, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ addq $32, %r11
+
+ cmpl $5, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ addq $32, %r11
+
+ cmpl $6, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %xmm0, %xmm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmaskmovps %xmm1, %xmm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmaskmovps %xmm2, %xmm15, 64(%r11)
+ cmpl $4, %r15d
+ jl 7f // end
+ vmaskmovps %xmm3, %xmm15, 96(%r11)
+ cmpl $5, %r15d
+ jl 7f // end
+ vmaskmovps %xmm4, %xmm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmaskmovps %xmm5, %xmm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmaskmovps %xmm6, %xmm15, 192(%r11)
+ je 7f // end
+ vmaskmovps %xmm7, %xmm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_gen_lib8, .-inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_lib8, @function
+inner_store_l_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib8:
+#endif
+#endif
+
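+	// keep the strictly-upper part of D: reload columns 1..3 and blend
+	// their first 1, 2 and 3 entries over the computed values, so only
+	// the entries on and below the diagonal are overwritten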
+ vmovaps 32(%r10), %ymm12
+ vmovaps 64(%r10), %ymm13
+ vmovaps 96(%r10), %ymm14
+
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vblendps $0x3, %ymm13, %ymm2, %ymm2
+ vblendps $0x7, %ymm14, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_lib8, .-inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_vs_lib8, @function
+inner_store_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x3, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x7, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ //
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_vs_lib8, .-inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_gen_lib8, @function
+inner_store_l_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x3, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x7, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_gen_lib8, .-inner_store_l_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
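+//
+// computes D = alpha * A * B^T + beta * C for one 8x4 block, with all
+// matrices stored in the 8-row panel (lib8) format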
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_lib8
+ .type kernel_sgemm_nt_8x4_lib8, @function
+kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_lib8
+_kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_lib8
+ .def kernel_sgemm_nt_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_lib8, .-kernel_sgemm_nt_8x4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_4x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
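+//
+// computes the 4x8 block D = alpha * A * B^T + beta * C by running the 8x4
+// nt kernel with A and B swapped, then transposing the accumulator before
+// the store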
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_lib8
+ .type kernel_sgemm_nt_4x8_lib8, @function
+kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_lib8
+_kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_lib8
+ .def kernel_sgemm_nt_4x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG3, %r12 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_lib8, .-kernel_sgemm_nt_4x8_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
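+//
+// variable-size variant: km and kn give how many rows and columns of the
+// 8x4 block are actually written to D (masked stores)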
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_vs_lib8
+ .type kernel_sgemm_nt_8x4_vs_lib8, @function
+kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_vs_lib8
+_kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_vs_lib8
+ .def kernel_sgemm_nt_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_vs_lib8, .-kernel_sgemm_nt_8x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_4x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_vs_lib8
+ .type kernel_sgemm_nt_4x8_vs_lib8, @function
+kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_vs_lib8
+_kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_vs_lib8
+ .def kernel_sgemm_nt_4x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG3, %r12 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_vs_lib8, .-kernel_sgemm_nt_4x8_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_8x4_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
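+//
+// generalized variant: C and D start at offsetC/offsetD within their panels
+// and the stored block is clipped to rows [m0, m1) and columns [n0, n1);
+// the non-zero offset paths below are still marked TODO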
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_gen_lib8
+ .type kernel_sgemm_nt_8x4_gen_lib8, @function
+kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_gen_lib8
+_kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_gen_lib8
+ .def kernel_sgemm_nt_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_gen_lib8, .-kernel_sgemm_nt_8x4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_4x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_gen_lib8
+ .type kernel_sgemm_nt_4x8_gen_lib8, @function
+kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_gen_lib8
+_kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_gen_lib8
+ .def kernel_sgemm_nt_4x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG3, %r12 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_gen_lib8, .-kernel_sgemm_nt_4x8_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_sgemm_nn_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
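+
+// Reference sketch (plain C, not the optimized path; ref_ name and by-value
+// alpha/beta are illustrative only).  In the 'nn' variant B is k x 4 and is
+// walked down its rows, crossing panel boundaries, which is why the kernel
+// needs offsetB and the panel stride sdb.  Assuming the lib8 panel-major
+// layout, element (r,c) of B sits at B[(r/8)*8*sdb + r%8 + c*8]:
+//
+//     static void ref_sgemm_nn_8x4(int k, float alpha, const float *A,
+//                                  int offsetB, const float *B, int sdb,
+//                                  float beta, const float *C, float *D)
+//     {
+//         for (int i = 0; i < 8; i++)
+//             for (int j = 0; j < 4; j++) {
+//                 float acc = 0.0f;
+//                 for (int l = 0; l < k; l++) {
+//                     int r = offsetB + l;                      // row of B
+//                     acc += A[i + l*8] * B[(r/8)*8*sdb + r%8 + j*8];
+//                 }
+//                 D[i + j*8] = alpha*acc + beta*C[i + j*8];
+//             }
+//     }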
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_lib8
+ .type kernel_sgemm_nn_8x4_lib8, @function
+kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_lib8
+_kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_lib8
+ .def kernel_sgemm_nn_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_lib8, .-kernel_sgemm_nn_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_sgemm_nn_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_vs_lib8
+ .type kernel_sgemm_nn_8x4_vs_lib8, @function
+kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_vs_lib8
+_kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_vs_lib8
+ .def kernel_sgemm_nn_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_vs_lib8, .-kernel_sgemm_nn_8x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_sgemm_nn_8x4_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_gen_lib8
+ .type kernel_sgemm_nn_8x4_gen_lib8, @function
+kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_gen_lib8
+_kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_gen_lib8
+ .def kernel_sgemm_nn_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_gen_lib8, .-kernel_sgemm_nn_8x4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_ssyrk_nt_l_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
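+
+// Reference sketch (plain C): same product as the 8x4 'nt' gemm kernel, but
+// only the lower-triangular part of the tile (row index >= column index) is
+// written back, leaving the strictly upper part of D untouched.  Assuming the
+// lib8 panel layout (element (i,j) at p[i + j*8]):
+//
+//     for (int j = 0; j < 4; j++)
+//         for (int i = j; i < 8; i++) {
+//             float acc = 0.0f;
+//             for (int l = 0; l < k; l++)
+//                 acc += A[i + l*8] * B[j + l*8];
+//             D[i + j*8] = alpha*acc + beta*C[i + j*8];
+//         }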
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x4_lib8
+ .type kernel_ssyrk_nt_l_8x4_lib8, @function
+kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x4_lib8
+_kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x4_lib8
+ .def kernel_ssyrk_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x4_lib8, .-kernel_ssyrk_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_nt_l_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x4_vs_lib8
+ .type kernel_ssyrk_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x4_vs_lib8
+_kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x4_vs_lib8
+ .def kernel_ssyrk_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x4_vs_lib8, .-kernel_ssyrk_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_strsm_nt_rl_inv_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
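+
+// Reference sketch (plain C): the kernel first accumulates M = C - A*B^T
+// (no alpha/beta here), then solves D*E^T = M for D, with E a 4x4 lower
+// triangular factor whose reciprocal diagonal is passed in inv_diag_E.
+// Assuming the lib8 panel layout (element (i,j) at p[i + j*8]):
+//
+//     for (int j = 0; j < 4; j++)                 // forward substitution over columns
+//         for (int i = 0; i < 8; i++) {
+//             float m = C[i + j*8];
+//             for (int l = 0; l < k; l++)
+//                 m -= A[i + l*8] * B[j + l*8];
+//             for (int l = 0; l < j; l++)
+//                 m -= D[i + l*8] * E[j + l*8];    // E is lower triangular
+//             D[i + j*8] = m * inv_diag_E[j];      // diagonal passed as reciprocals
+//         }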
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x4_lib8
+ .type kernel_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x4_lib8
+_kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x4_lib8
+ .def kernel_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x4_lib8, .-kernel_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_strsm_nt_rl_inv_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+ .type kernel_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+ .def kernel_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
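+
+// Reference sketch (plain C): the fused kernel accumulates
+// M = Ap*Bp^T - Am*Bm^T + C (kp terms added, km terms subtracted) and then
+// applies the same right-lower-transposed solve with E / inv_diag_E as
+// kernel_strsm_nt_rl_inv_8x4_lib8 above; per element:
+//
+//     float m = C[i + j*8];
+//     for (int l = 0; l < kp; l++) m += Ap[i + l*8] * Bp[j + l*8];
+//     for (int l = 0; l < km; l++) m -= Am[i + l*8] * Bm[j + l*8];
+//     // ...then the forward substitution over columns j, as sketched above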
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_spotrf_nt_l_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
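+
+// Reference sketch (plain C, pivot handling simplified): after accumulating
+// M = C - A*B^T, the edge routine computes the lower Cholesky factor of the
+// top 4x4 block and scales the remaining rows, writing the reciprocal pivots
+// to inv_diag_D.  Assuming the lib8 panel layout (element (i,j) at p[i + j*8]):
+//
+//     for (int j = 0; j < 4; j++)
+//         for (int i = j; i < 8; i++) {
+//             float m = C[i + j*8];
+//             for (int l = 0; l < k; l++) m -= A[i + l*8] * B[j + l*8];
+//             for (int l = 0; l < j; l++) m -= D[i + l*8] * D[j + l*8];
+//             if (i == j) {
+//                 float djj = sqrtf(m);            // needs <math.h>; zero pivot not handled
+//                 inv_diag_D[j] = 1.0f / djj;
+//                 D[j + j*8] = djj;
+//             } else {
+//                 D[i + j*8] = m * inv_diag_D[j];
+//             }
+//         }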
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x4_lib8
+ .type kernel_spotrf_nt_l_8x4_lib8, @function
+kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x4_lib8
+_kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x4_lib8
+ .def kernel_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x4_lib8, .-kernel_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_spotrf_nt_l_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x4_vs_lib8
+ .type kernel_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x4_vs_lib8
+_kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x4_vs_lib8
+ .def kernel_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+	movq	ARG7, %r11 // km
+	movq	ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x4_vs_lib8, .-kernel_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_spotrf_nt_l_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x4_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x4_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_strmm_nn_rl_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D);
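+
+// Reference sketch (plain C, simplified): D = alpha * A * B with B lower
+// triangular on the right, so column j of B is zero above the diagonal and
+// the sum over l effectively starts at l = j (the 'initial triangle' edge
+// below handles those first, partial columns).  Assuming the lib8 panel
+// layout, with B read across panels as in the nn gemm kernels:
+//
+//     for (int i = 0; i < 8; i++)
+//         for (int j = 0; j < 4; j++) {
+//             float acc = 0.0f;
+//             for (int l = j; l < k; l++) {
+//                 int r = offsetB + l;
+//                 acc += A[i + l*8] * B[(r/8)*8*sdb + r%8 + j*8];
+//             }
+//             D[i + j*8] = alpha * acc;
+//         }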
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_lib8
+ .type kernel_strmm_nn_rl_8x4_lib8, @function
+kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_lib8
+_kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_lib8
+ .def kernel_strmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_lib8, .-kernel_strmm_nn_rl_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strmm_nn_rl_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_vs_lib8
+ .type kernel_strmm_nn_rl_8x4_vs_lib8, @function
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_vs_lib8
+_kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_vs_lib8
+ .def kernel_strmm_nn_rl_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_vs_lib8, .-kernel_strmm_nn_rl_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_strmm_nn_rl_8x4_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_gen_lib8
+ .type kernel_strmm_nn_rl_8x4_gen_lib8, @function
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_gen_lib8
+_kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_gen_lib8
+ .def kernel_strmm_nn_rl_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // offsetD
+ movq ARG8, %r11 // D
+ movq ARG9, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG10, %r13 // m0
+ movq ARG11, %r14 // m1
+ movq ARG12, %r15 // n0
+ movq ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_gen_lib8, .-kernel_strmm_nn_rl_8x4_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
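+
+// The decimal .long values above are raw IEEE-754 single-precision bit
+// patterns (e.g. 1056964608 == 0x3F000000 == 0.5f, 3212836864 == -1.0f).
+// LC00-LC02 hold the index vectors {0.5,...,7.5}, {8.5,...,15.5} and
+// {16.5,...,23.5} that the _vs/_gen store routines compare against a
+// broadcast bound to build vmaskmovps masks.  A hedged C sketch of that idea
+// (bits_to_float/build_mask are illustrative names):
+//
+//     #include <stdint.h>
+//     #include <string.h>
+//     static inline float bits_to_float(uint32_t u) {
+//         float f; memcpy(&f, &u, sizeof f); return f;   // 1056964608 -> 0.5f
+//     }
+//     // lane i is stored when index i+0.5 lies below the row bound m1
+//     static inline void build_mask(int m1, const float idx[8], int32_t mask[8]) {
+//         for (int i = 0; i < 8; i++) mask[i] = idx[i] < (float)m1 ? -1 : 0;
+//     }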
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_8x8_lib8.S b/kernel/avx/kernel_sgemm_8x8_lib8.S
new file mode 100644
index 0000000..354fa83
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_8x8_lib8.S
@@ -0,0 +1,5514 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
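+//
+// Net effect, in plain C terms (the rotated accumulator layout documented
+// above is undone later by the blend/transpose routines): for each l < k the
+// routine performs the rank-1 update of an 8x8 tile
+//
+//     for (int i = 0; i < 8; i++)
+//         for (int j = 0; j < 8; j++)
+//             acc[i][j] += A[i + l*8] * B[j + l*8];
+//
+// using vbroadcastf128 + vshufps to cycle through four in-lane rotations of
+// each 128-bit half of B(:,l) instead of eight scalar broadcasts.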
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_8x8_lib8, @function
+inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 32(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 96(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 112(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 32(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 96(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 112(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+// vbroadcastf128 0(%r12), %ymm14 // B
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vaddps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+// vbroadcastf128 16(%r12), %ymm15 // B
+ vaddps %ymm11, %ymm7, %ymm7
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm2, %ymm2
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm3, %ymm3
+
+ vbroadcastf128 16(%r12), %ymm14 // B
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm4, %ymm4
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm5, %ymm5
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm6, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm7, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_8x8_lib8, .-inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_8x8_lib8, @function
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 32(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 96(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 112(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 32(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm12, %ymm14, %ymm11
+ vbroadcastf128 96(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm12, %ymm15, %ymm11
+ vbroadcastf128 112(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vmulps %ymm13, %ymm14, %ymm11
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vmulps %ymm13, %ymm14, %ymm11
+// vbroadcastf128 0(%r12), %ymm14 // B
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vmulps %ymm13, %ymm15, %ymm11
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+ vsubps %ymm11, %ymm6, %ymm6
+
+ vmulps %ymm13, %ymm15, %ymm11
+// vbroadcastf128 16(%r12), %ymm15 // B
+ vsubps %ymm11, %ymm7, %ymm7
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm0, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm1, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm2, %ymm2
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm3, %ymm3
+
+ vbroadcastf128 16(%r12), %ymm14 // B
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm4, %ymm4
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm5, %ymm5
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm6, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vsubps %ymm11, %ymm7, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_8x8_lib8, .-inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 8*sdb*sizeof(float)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r13 <- 8*sdb*sizeof(float)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
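+// Rough plain-C equivalent (a sketch, assuming A is an 8xk panel-major block
+// and B is a kx8 block stored in 8-row panels with panel stride sdb):
+//
+//     for(kk=0; kk<k; kk++)
+//         for(jj=0; jj<8; jj++)
+//             for(ii=0; ii<8; ii++)
+//                 acc[jj][ii] += A[ii+8*kk] * B[(kk/8)*8*sdb + (kk%8) + 8*jj];
+//
+// ymm0-ymm7 hold the 8 columns of acc; r14 tracks the next B panel, which is
+// software-prefetched and swapped into r12 every 8 iterations.
+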
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_8x8_lib8, @function
+inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r12, %r14 // B_next <- B
+	addq	%r13, %r14 // B_next <- B + 8*sdb*sizeof(float)
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r14) // software prefetch
+ prefetcht0 64(%r14) // software prefetch
+ prefetcht0 128(%r14) // software prefetch
+ prefetcht0 192(%r14) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 132(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 164(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 196(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 228(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 136(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 168(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 200(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 232(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 140(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 172(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 204(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 236(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 144(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 176(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 208(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 240(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 148(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 180(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 212(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 244(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 152(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 184(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 216(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 248(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+
+ // unroll 7
+ vmovaps 224(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 156(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 188(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 220(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 252(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ subl $8, %r10d
+ addq $256, %r11
+
+ mov %r14, %r12
+ addq %r13, %r14
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_8x8_lib8, .-inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(8-offB)
+// r11 <- A+(8-offB)*bs*sizeof(float)
+// r12 <- B+bs*sdb*sizeof(float)
+// r13 <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
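+// Rough sketch of the edge handling (assuming bs=8): when B starts at a row
+// offset inside its panel (offB = r14 > 0), this routine peels
+//
+//     kend = k < 8-offB ? k : 8-offB
+//
+// iterations of the rank-1 update above, one column of A at a time, and then
+// (if work remains) advances B to the start of the next panel so that the
+// aligned 8x8 kernel can take over.
+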
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_8x8_lib8, @function
+inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %ebx
+ subl %r14d, %ebx // 8-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,8-offsetB)
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r12 // B+offsetB*sizeof(float)
+
+1: // edge loop
+	// unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vmulps %ymm12, %ymm13, %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r12 // B+1*sizeof(float)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_8x8_lib8, .-inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
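+// Rough plain-C equivalent (a sketch; the exact cut-off of the variable-size
+// cases is handled by the kn checks in the code): a forward substitution over
+// the 8 accumulator columns x_0..x_7 (ymm0-ymm7), using the pre-inverted
+// diagonal in inv_diag_D and the sub-diagonal entries of the 8x8 panel at D:
+//
+//     for(jj=0; jj<8; jj++) {
+//         x[jj] *= inv_diag_D[jj];            // scale by 1/d_jj
+//         for(ii=jj+1; ii<8; ii++)
+//             x[ii] -= D[ii+8*jj] * x[jj];    // eliminate column jj
+//     }
+//
+// with the work past column kn-1 (r12d) skipped when kn < 8.
+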
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x8_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vbroadcastss 16(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vbroadcastss 20(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 24(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 28(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vbroadcastss 48(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vbroadcastss 52(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 56(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 60(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vbroadcastss 80(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vbroadcastss 84(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 88(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 92(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vbroadcastss 112(%r10), %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vbroadcastss 116(%r10), %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 120(%r10), %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 124(%r10), %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 16(%r11), %ymm13
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $6, %r12d
+ jl 0f // ret
+ vbroadcastss 148(%r10), %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vbroadcastss 152(%r10), %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 156(%r10), %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 20(%r11), %ymm13
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $7, %r12d
+ jl 0f // ret
+ vbroadcastss 184(%r10), %ymm13
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vbroadcastss 188(%r10), %ymm13
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 24(%r11), %ymm13
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $8, %r12d
+ jl 0f // ret
+ vbroadcastss 220(%r10), %ymm13
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+ vbroadcastss 28(%r11), %ymm13
+ vmulps %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x8_vs_lib8, .-inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
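+// Rough plain-C equivalent (a sketch): an unblocked Cholesky factorization of
+// the 8x8 accumulator (one column per ymm register), storing the reciprocal of
+// each accepted pivot into inv_diag_E:
+//
+//     for(jj=0; jj<8; jj++) {
+//         d = acc[jj][jj];
+//         inv = d > 0.0f ? 1.0f/sqrtf(d) : 0.0f;   // guard against non-positive pivots
+//         inv_diag_E[jj] = inv;
+//         acc[jj] *= inv;                          // scale column jj (8-wide)
+//         for(ii=jj+1; ii<8; ii++)
+//             acc[ii] -= acc[jj][ii] * acc[jj];    // trailing update
+//     }
+//
+// with the trailing updates past column kn-1 (r11d) skipped in the
+// variable-size cases.
+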
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x8_vs_lib8, @function
+inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x8_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss	%xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm4, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm3, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm4, %xmm13
+// vpermilps $0x00, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_44 > 0.0 ?
+ jbe 9f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+10:
+	vmovss	%xmm13, 16(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $6, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm5, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm4, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm5, %xmm13
+ vpermilps $0x55, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_55 > 0.0 ?
+ jbe 11f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+12:
+	vmovss	%xmm13, 20(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $7, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm6, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm5, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm6, %xmm13
+ vpermilps $0xaa, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_66 > 0.0 ?
+ jbe 13f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+14:
+	vmovss	%xmm13, 24(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $8, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm6, %ymm13, %ymm12
+ vsubps %ymm12, %ymm7, %ymm7
+
+
+ vextractf128 $0x1, %ymm7, %xmm13
+ vpermilps $0xff, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_77 > 0.0 ?
+ jbe 15f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+16:
+	vmovss	%xmm13, 28(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm7, %ymm13, %ymm7
+
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+9:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 10b
+
+11:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 12b
+
+13:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 14b
+
+15:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 16b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x8_vs_lib8, .-inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
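+// Rough plain-C equivalent (a sketch, with C stored as 8 contiguous 8-float
+// columns):
+//
+//     for(jj=0; jj<8; jj++)
+//         for(ii=0; ii<8; ii++)
+//             acc[jj][ii] = alpha*acc[jj][ii] + (beta!=0.0f ? beta*C[ii+8*jj] : 0.0f);
+//
+// the beta==0.0 test lets the routine skip the loads of C entirely.
+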
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x8_lib8, @function
+inner_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmovaps 128(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 160(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 192(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 224(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x8_lib8, .-inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x8_gen_lib8, @function
+inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+	// last four columns: beta is in ymm15 in this routine, ymm14 holds 0.0
+	vmovaps		128(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm4, %ymm12, %ymm4
+	vmovaps		160(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm5, %ymm12, %ymm5
+	vmovaps		192(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm6, %ymm12, %ymm6
+	vmovaps		224(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm7, %ymm12, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r12d // offset
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d // offset
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d // offset
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x8_gen_lib8, .-inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
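+// Rough sketch: the vblendps stage below re-assembles the natural column order
+// of the accumulator from the lane-rotated registers produced by the vshufps
+// pattern of the NT kernels; after that the routine applies the same
+// alpha/beta scaling as inner_scale_ab_8x8_lib8.
+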
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x8_lib8, @function
+inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmovaps 128(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 160(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 192(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 224(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x8_lib8, .-inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x8_gen_lib8, @function
+inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+	// last four columns: beta is in ymm15 in this routine, ymm14 holds 0.0
+	vmovaps		128(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm4, %ymm12, %ymm4
+	vmovaps		160(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm5, %ymm12, %ymm5
+	vmovaps		192(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm6, %ymm12, %ymm6
+	vmovaps		224(%r13), %ymm12
+	vmulps		%ymm12, %ymm15, %ymm12
+	vaddps		%ymm7, %ymm12, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+	cmpl	$4, %r12d // offset
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d // offset
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d // offset
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x8_gen_lib8, .-inner_blend_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x8_lib8, @function
+inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmovaps 128(%r10), %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 160(%r10), %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 192(%r10), %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 224(%r10), %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x8_lib8, .-inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x8_gen_lib8, @function
+inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+ vmovaps 128(%r11), %ymm12
+ vaddps %ymm4, %ymm12, %ymm4
+ vmovaps 160(%r11), %ymm12
+ vaddps %ymm5, %ymm12, %ymm5
+ vmovaps 192(%r11), %ymm12
+ vaddps %ymm6, %ymm12, %ymm6
+ vmovaps 224(%r11), %ymm12
+ vaddps %ymm7, %ymm12, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x8_gen_lib8, .-inner_blend_scale_11_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_lib8, @function
+inner_store_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_lib8:
+#endif
+#endif
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+ vmovaps %ymm4, 128(%r10)
+ vmovaps %ymm5, 160(%r10)
+ vmovaps %ymm6, 192(%r10)
+ vmovaps %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_lib8, .-inner_store_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
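+// Rough sketch (assuming .LC00 holds ascending lane constants, roughly
+// 0.5, 1.5, ..., 7.5): the row mask compares those constants against km, so
+// vmaskmovps writes only lanes ii < km and leaves the remaining rows of D
+// untouched; kn then gates how many of the trailing columns (6th, 7th, 8th)
+// are written at all:
+//
+//     for(jj=0; jj<kn; jj++)        // kn assumed >= 5 here
+//         for(ii=0; ii<km; ii++)
+//             D[ii+8*jj] = acc[jj][ii];
+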
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_vs_lib8, @function
+inner_store_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ vmaskmovps %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovps %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovps %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmaskmovps %ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_vs_lib8, .-inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
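+// Rough sketch: the generalized store first builds a row mask that keeps only
+// lanes with m0 <= ii < m1 (the two vsubps results are and-ed together), then
+// discards the first n0 columns by shifting the accumulator registers down,
+// clamps the column count to n1-n0, and writes the surviving columns through
+// vmaskmovps; the offset > 0 (panel-crossing) cases are still TODO stubs below.
+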
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_gen_lib8, @function
+inner_store_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+	vmovaps	%ymm1, %ymm0
+	vmovaps	%ymm2, %ymm1
+	vmovaps	%ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ vmaskmovps %ymm4, %ymm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmaskmovps %ymm5, %ymm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmaskmovps %ymm6, %ymm15, 192(%r11)
+ je 7f // end
+ vmaskmovps %ymm7, %ymm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_gen_lib8, .-inner_store_8x8_gen_lib8
+#endif
+#endif
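+
+// note: in inner_store_8x8_gen_lib8 above, the non-zero offset branches
+// (row offset 1..7 within a panel, handled after the second "0:" label)
+// are still placeholders marked TODO; they fall through to the end label
+// without writing anything, so only the offset==0 path actually stores.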
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_lib8, @function
+inner_store_l_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_lib8:
+#endif
+#endif
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps 32(%r10), %ymm14
+ vblendps $0x01, %ymm14, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmovaps 64(%r10), %ymm14
+ vblendps $0x03, %ymm14, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmovaps 96(%r10), %ymm14
+ vblendps $0x07, %ymm14, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmovaps 128(%r10), %ymm14
+ vblendps $0x0f, %ymm14, %ymm4, %ymm4
+ vmovaps %ymm4, 128(%r10)
+ vmovaps 160(%r10), %ymm14
+ vblendps $0x1f, %ymm14, %ymm5, %ymm5
+ vmovaps %ymm5, 160(%r10)
+ vmovaps 192(%r10), %ymm14
+ vblendps $0x3f, %ymm14, %ymm6, %ymm6
+ vmovaps %ymm6, 192(%r10)
+ vmovaps 224(%r10), %ymm14
+ vblendps $0x7f, %ymm14, %ymm7, %ymm7
+ vmovaps %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x8_lib8, .-inner_store_l_8x8_lib8
+#endif
+#endif
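+
+// note: the vblendps immediates above (0x01, 0x03, 0x07, ...) take the
+// first j lanes of column j from memory and the remaining lanes from the
+// computed result, so only the lower triangle (diagonal included) of the
+// 8x8 block is overwritten and the strictly upper part of D is preserved.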
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_vs_lib8, @function
+inner_store_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ vmovaps 128(%r10), %ymm12
+ vblendps $0x0f, %ymm12, %ymm4, %ymm4
+ vmaskmovps %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmovaps 160(%r10), %ymm12
+ vblendps $0x1f, %ymm12, %ymm5, %ymm5
+ vmaskmovps %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmovaps 192(%r10), %ymm12
+ vblendps $0x3f, %ymm12, %ymm6, %ymm6
+ vmaskmovps %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmovaps 224(%r10), %ymm12
+ vblendps $0x7f, %ymm12, %ymm7, %ymm7
+ vmaskmovps %ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x8_vs_lib8, .-inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_gen_lib8, @function
+inner_store_l_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ vmovaps 128(%r11), %ymm12
+ vblendps $0x0f, %ymm12, %ymm4, %ymm4
+ vmaskmovps %ymm4, %ymm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmovaps 160(%r11), %ymm12
+ vblendps $0x1f, %ymm12, %ymm5, %ymm5
+ vmaskmovps %ymm5, %ymm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmovaps 192(%r11), %ymm12
+ vblendps $0x3f, %ymm12, %ymm6, %ymm6
+ vmaskmovps %ymm6, %ymm15, 192(%r11)
+ je 7f // end
+ vmovaps 224(%r11), %ymm12
+ vblendps $0x7f, %ymm12, %ymm7, %ymm7
+ vmaskmovps %ymm7, %ymm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x8_gen_lib8, .-inner_store_l_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_lib8
+ .type kernel_sgemm_nt_8x8_lib8, @function
+kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_lib8
+_kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_lib8
+ .def kernel_sgemm_nt_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_lib8, .-kernel_sgemm_nt_8x8_lib8
+#endif
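+
+// usage note (illustrative sketch only, not part of this file): the
+// kernel computes D = alpha*A*B^T + beta*C on one 8x8 block, with A and
+// B given as 8 x k panels in the lib8 format (each column stored as 8
+// contiguous floats) and C, D as 8x8 blocks. With <stdlib.h> included
+// and 32-byte aligned buffers, a call could look like:
+//
+//   int k = 32;
+//   float alpha = 1.0f, beta = 0.0f;
+//   float *A = aligned_alloc(32, 8*k*sizeof(float)); // 8 x k panel
+//   float *B = aligned_alloc(32, 8*k*sizeof(float)); // 8 x k panel
+//   float *C = aligned_alloc(32, 8*8*sizeof(float)); // 8 x 8 block
+//   float *D = aligned_alloc(32, 8*8*sizeof(float)); // 8 x 8 block
+//   // ... pack A, B, C in panel-major order ...
+//   kernel_sgemm_nt_8x8_lib8(k, &alpha, A, B, &beta, C, D);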
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemm_nt_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_vs_lib8
+ .type kernel_sgemm_nt_8x8_vs_lib8, @function
+kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_vs_lib8
+_kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_vs_lib8
+ .def kernel_sgemm_nt_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_vs_lib8, .-kernel_sgemm_nt_8x8_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_8x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_gen_lib8
+ .type kernel_sgemm_nt_8x8_gen_lib8, @function
+kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_gen_lib8
+_kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_gen_lib8
+ .def kernel_sgemm_nt_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+ sall $5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_gen_lib8, .-kernel_sgemm_nt_8x8_gen_lib8
+#endif
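+
+// usage note (illustrative values only): the _gen_ variant additionally
+// takes row offsets and panel strides for C and D, plus the sub-block
+// bounds m0/m1 (rows) and n0/n1 (columns) that limit which entries of
+// the 8x8 result are written. With sdc/sdd the panel strides of C and D:
+//
+//   // write rows 0..7 and columns 0..5 of the result, C and D at
+//   // row offset 0:
+//   kernel_sgemm_nt_8x8_gen_lib8(k, &alpha, A, B, &beta,
+//                                0, C, sdc, 0, D, sdd, 0, 8, 0, 6);
+//
+// non-zero store offsets are not implemented in this file yet (see the
+// TODO branches in inner_store_8x8_gen_lib8 above), so offsetD==0 is the
+// only fully supported case here.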
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_lib8
+ .type kernel_sgemm_nn_8x8_lib8, @function
+kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_lib8
+_kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_lib8
+ .def kernel_sgemm_nn_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_lib8, .-kernel_sgemm_nn_8x8_lib8
+#endif
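+
+// note: the panel stride arguments (sdb here, sdc/sdd in the _gen_
+// variants) are converted to a byte stride by the "sall $5" instructions
+// above: each panel holds 8 float rows, so the distance between
+// consecutive panels is 8*sd*sizeof(float) = 32*sd bytes, i.e. a shift
+// left by 5.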
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_sgemm_nn_8x8_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_vs_lib8
+ .type kernel_sgemm_nn_8x8_vs_lib8, @function
+kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_vs_lib8
+_kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_vs_lib8
+ .def kernel_sgemm_nn_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_vs_lib8, .-kernel_sgemm_nn_8x8_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_sgemm_nn_8x8_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_gen_lib8
+ .type kernel_sgemm_nn_8x8_gen_lib8, @function
+kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_gen_lib8
+_kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_gen_lib8
+ .def kernel_sgemm_nn_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+ sall $5, %r12d // 8*sdd*sizeof(float)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_gen_lib8, .-kernel_sgemm_nn_8x8_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_ssyrk_nt_l_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x8_lib8
+ .type kernel_ssyrk_nt_l_8x8_lib8, @function
+kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x8_lib8
+_kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x8_lib8
+ .def kernel_ssyrk_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x8_lib8, .-kernel_ssyrk_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_nt_l_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x8_vs_lib8
+ .type kernel_ssyrk_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x8_vs_lib8
+_kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x8_vs_lib8
+ .def kernel_ssyrk_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x8_vs_lib8, .-kernel_ssyrk_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_strsm_nt_rl_inv_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x8_lib8
+ .type kernel_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x8_lib8
+_kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x8_lib8
+ .def kernel_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movl $8, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x8_lib8, .-kernel_strsm_nt_rl_inv_8x8_lib8
+#endif
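+
+// note (informal reading of the call sequence above): the kernel
+// accumulates -A*B^T, adds C, then applies the right-lower-transposed
+// triangular solve against E, i.e. it computes D such that
+// D * E^T = C - A*B^T, with inv_diag_E holding the reciprocals of the
+// diagonal of E so the solve uses multiplications instead of divisions.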
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_strsm_nt_rl_inv_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+ .type kernel_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+ .def kernel_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // m1
+ movq ARG9, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq $8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_spotrf_nt_l_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x8_lib8
+ .type kernel_spotrf_nt_l_8x8_lib8, @function
+kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x8_lib8
+_kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x8_lib8
+ .def kernel_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movl $8, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x8_lib8, .-kernel_spotrf_nt_l_8x8_lib8
+#endif
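+
+// note (informal): this kernel forms the 8x8 block C - A*B^T, computes
+// its lower Cholesky factor via inner_edge_potrf_8x8_vs_lib8, stores the
+// factor (lower triangle only) into D, and writes the reciprocals of the
+// factor's diagonal into inv_diag_D for use by subsequent trsm kernels.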
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_spotrf_nt_l_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x8_vs_lib8
+ .type kernel_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x8_vs_lib8
+_kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x8_vs_lib8
+ .def kernel_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // m1
+ movq ARG8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x8_vs_lib8, .-kernel_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_spotrf_nt_l_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x8_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x8_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $8, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x8_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
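+
+// note: these are the float bit patterns of { 0.5, 1.5, ..., 7.5 } in
+// memory order (lane i holds i+0.5); the comment above lists the lanes
+// from high to low. The vs/gen store routines broadcast the row count m,
+// compute (i+0.5) - m and use the sign bit as a per-lane store mask, so
+// vmaskmovps writes exactly lanes 0..m-1; the gen variants build two such
+// masks (for m0 and m1) and combine them with vandps. An equivalent
+// C-intrinsics sketch (illustrative only):
+//
+//   const float lc00[8] = {0.5f,1.5f,2.5f,3.5f,4.5f,5.5f,6.5f,7.5f};
+//   float m_f = (float)m;                  // rows to store, 0 < m <= 8
+//   __m256i mask = _mm256_castps_si256(_mm256_sub_ps(
+//       _mm256_loadu_ps(lc00), _mm256_broadcast_ss(&m_f)));
+//   _mm256_maskstore_ps(dst, mask, data);  // writes lanes 0..m-1 only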
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgemm_diag_lib8.c b/kernel/avx/kernel_sgemm_diag_lib8.c
new file mode 100644
index 0000000..63183b2
--- /dev/null
+++ b/kernel/avx/kernel_sgemm_diag_lib8.c
@@ -0,0 +1,480 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+
+
+// B is the diagonal of a matrix, beta==0.0 case
+void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 8;
+
+ int k;
+
+ __m256
+ alpha0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22, b_33,
+ d_00, d_01, d_02, d_03;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+ b_11 = _mm256_broadcast_ss( &B[1] );
+ b_11 = _mm256_mul_ps( b_11, alpha0 );
+ b_22 = _mm256_broadcast_ss( &B[2] );
+ b_22 = _mm256_mul_ps( b_22, alpha0 );
+ b_33 = _mm256_broadcast_ss( &B[3] );
+ b_33 = _mm256_mul_ps( b_33, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+ a_00 = _mm256_load_ps( &A[24] );
+ d_03 = _mm256_mul_ps( a_00, b_33 );
+
+ _mm256_store_ps( &D[0], d_00 );
+ _mm256_store_ps( &D[8], d_01 );
+ _mm256_store_ps( &D[16], d_02 );
+ _mm256_store_ps( &D[24], d_03 );
+
+ A += 8*sda;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+ a_00 = _mm256_load_ps( &A[24] );
+ d_03 = _mm256_mul_ps( a_00, b_33 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+ _mm256_maskstore_ps( &D[8], mask_i, d_01 );
+ _mm256_maskstore_ps( &D[16], mask_i, d_02 );
+ _mm256_maskstore_ps( &D[24], mask_i, d_03 );
+
+ }
+
+ }
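+
+// note (scalar reference, illustrative only): with B holding the 4
+// diagonal entries, the kernel above computes, in the lib8 panel layout,
+//
+//   for (jj = 0; jj < 4; jj++)
+//     for (ii = 0; ii < kmax; ii++)
+//       D[(ii/8)*8*sdd + ii%8 + 8*jj] =
+//           alpha[0] * B[jj] * A[(ii/8)*8*sda + ii%8 + 8*jj];
+//
+// i.e. D = alpha * A * diag(B) with beta==0; the tail (kmax not a
+// multiple of 8) is handled with the same float-compare store mask used
+// by the assembly kernels.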
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 8;
+
+ int k;
+
+ __m256
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22, b_33,
+ c_00,
+ d_00, d_01, d_02, d_03;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+ beta0 = _mm256_broadcast_ss( beta );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+ b_11 = _mm256_broadcast_ss( &B[1] );
+ b_11 = _mm256_mul_ps( b_11, alpha0 );
+ b_22 = _mm256_broadcast_ss( &B[2] );
+ b_22 = _mm256_mul_ps( b_22, alpha0 );
+ b_33 = _mm256_broadcast_ss( &B[3] );
+ b_33 = _mm256_mul_ps( b_33, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+ a_00 = _mm256_load_ps( &A[24] );
+ d_03 = _mm256_mul_ps( a_00, b_33 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+ c_00 = _mm256_load_ps( &C[16] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_02 = _mm256_add_ps( c_00, d_02 );
+ c_00 = _mm256_load_ps( &C[24] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_03 = _mm256_add_ps( c_00, d_03 );
+
+ _mm256_store_ps( &D[0], d_00 );
+ _mm256_store_ps( &D[8], d_01 );
+ _mm256_store_ps( &D[16], d_02 );
+ _mm256_store_ps( &D[24], d_03 );
+
+ A += 8*sda;
+ C += 8*sdc;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+ a_00 = _mm256_load_ps( &A[24] );
+ d_03 = _mm256_mul_ps( a_00, b_33 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+ c_00 = _mm256_load_ps( &C[16] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_02 = _mm256_add_ps( c_00, d_02 );
+ c_00 = _mm256_load_ps( &C[24] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_03 = _mm256_add_ps( c_00, d_03 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+ _mm256_maskstore_ps( &D[8], mask_i, d_01 );
+ _mm256_maskstore_ps( &D[16], mask_i, d_02 );
+ _mm256_maskstore_ps( &D[24], mask_i, d_03 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 8;
+
+ int k;
+
+ __m256
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11, b_22,
+ c_00,
+ d_00, d_01, d_02;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+ beta0 = _mm256_broadcast_ss( beta );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+ b_11 = _mm256_broadcast_ss( &B[1] );
+ b_11 = _mm256_mul_ps( b_11, alpha0 );
+ b_22 = _mm256_broadcast_ss( &B[2] );
+ b_22 = _mm256_mul_ps( b_22, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+ c_00 = _mm256_load_ps( &C[16] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_02 = _mm256_add_ps( c_00, d_02 );
+
+ _mm256_store_ps( &D[0], d_00 );
+ _mm256_store_ps( &D[8], d_01 );
+ _mm256_store_ps( &D[16], d_02 );
+
+ A += 8*sda;
+ C += 8*sdc;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+ a_00 = _mm256_load_ps( &A[16] );
+ d_02 = _mm256_mul_ps( a_00, b_22 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+ c_00 = _mm256_load_ps( &C[16] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_02 = _mm256_add_ps( c_00, d_02 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+ _mm256_maskstore_ps( &D[8], mask_i, d_01 );
+ _mm256_maskstore_ps( &D[16], mask_i, d_02 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 8;
+
+ int k;
+
+ __m256
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00, b_11,
+ c_00,
+ d_00, d_01;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+ beta0 = _mm256_broadcast_ss( beta );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+ b_11 = _mm256_broadcast_ss( &B[1] );
+ b_11 = _mm256_mul_ps( b_11, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+
+ _mm256_store_ps( &D[0], d_00 );
+ _mm256_store_ps( &D[8], d_01 );
+
+ A += 8*sda;
+ C += 8*sdc;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+ a_00 = _mm256_load_ps( &A[8] );
+ d_01 = _mm256_mul_ps( a_00, b_11 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+ c_00 = _mm256_load_ps( &C[8] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_01 = _mm256_add_ps( c_00, d_01 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+ _mm256_maskstore_ps( &D[8], mask_i, d_01 );
+
+ }
+
+ }
+
+
+
+// B is the diagonal of a matrix
+void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 8;
+
+ int k;
+
+ __m256
+ alpha0, beta0,
+ mask_f,
+ sign,
+ a_00,
+ b_00,
+ c_00,
+ d_00;
+
+ __m256i
+ mask_i;
+
+ alpha0 = _mm256_broadcast_ss( alpha );
+ beta0 = _mm256_broadcast_ss( beta );
+
+ b_00 = _mm256_broadcast_ss( &B[0] );
+ b_00 = _mm256_mul_ps( b_00, alpha0 );
+
+ for(k=0; k<kmax-7; k+=8)
+ {
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+
+ _mm256_store_ps( &D[0], d_00 );
+
+ A += 8*sda;
+ C += 8*sdc;
+ D += 8*sdd;
+
+ }
+ if(k<kmax)
+ {
+
+ const float mask_f[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
+ float m_f = kmax-k;
+
+ mask_i = _mm256_castps_si256( _mm256_sub_ps( _mm256_loadu_ps( mask_f ), _mm256_broadcast_ss( &m_f ) ) );
+
+ a_00 = _mm256_load_ps( &A[0] );
+ d_00 = _mm256_mul_ps( a_00, b_00 );
+
+ c_00 = _mm256_load_ps( &C[0] );
+ c_00 = _mm256_mul_ps( c_00, beta0 );
+ d_00 = _mm256_add_ps( c_00, d_00 );
+
+ _mm256_maskstore_ps( &D[0], mask_i, d_00 );
+
+ }
+
+ }
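+
+// Reference sketch (illustration only, not part of the library) of what the
+// kernel_sgemm_diag_right_* kernels above compute, as read off the
+// intrinsics: D = alpha * A * diag(B) + beta * C, with A, C and D stored in
+// 8-row panels and column j of a panel starting at offset 8*j. The helper
+// name sgemm_diag_right_ref and the explicit column count n_col are
+// illustrative and do not exist in the library.
+#if 0
+static void sgemm_diag_right_ref(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int n_col)
+	{
+	int k, kk, j;
+	float a0 = alpha[0];
+	float b0 = beta[0];
+	for(k=0; k<kmax; k+=8)
+		{
+		int m = kmax-k<8 ? kmax-k : 8; // rows in this 8-row panel
+		for(j=0; j<n_col; j++)
+			{
+			for(kk=0; kk<m; kk++)
+				D[kk+8*j] = a0*B[j]*A[kk+8*j] + b0*C[kk+8*j];
+			}
+		A += 8*sda; // advance to the next row panel
+		C += 8*sdc;
+		D += 8*sdd;
+		}
+	}
+#endif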
+
+
+
+
diff --git a/kernel/avx/kernel_sgemv_4_lib8.S b/kernel/avx/kernel_sgemv_4_lib8.S
new file mode 100644
index 0000000..1508ebe
--- /dev/null
+++ b/kernel/avx/kernel_sgemv_4_lib8.S
@@ -0,0 +1,2935 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12  <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11  <- A+k*sda*sizeof(float)
+// r12  <- bs*sda*sizeof(float) = 32*sda
+// r13  <- x+k*sizeof(float)
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
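+//
+// note: ymm0..ymm3 each hold 8 partial sums, one register per column of the
+// 4-column block of A; the main loop consumes 8 rows of A and x per
+// iteration, the clean-up path handles the final partial block with a lane
+// mask, and the blend routines further down reduce the partial sums to 4
+// scalars.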
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemv_add_t_4_lib8, @function
+inner_kernel_gemv_add_t_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_t_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemv_add_t_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_t_4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r13), %ymm12
+
+ vmovaps 0(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ subl $8, %r10d
+
+ vmovaps 32(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ addq %r12, %r11
+ addq $32, %r13
+
+ cmpl $7, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
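+	// note: .LC00 (defined elsewhere in this file) is expected to hold the
+	// 8 floats {0.5, 1.5, ..., 7.5}; subtracting the broadcast leftover count
+	// k makes exactly the first k lanes negative, so their sign bits give
+	// vmaskmovps a mask that loads only the k remaining rows.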
+
+ vcvtsi2ss %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm14
+
+ vmaskmovps 0(%r13), %ymm14, %ymm12
+
+ vmaskmovps 0(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ vmaskmovps 32(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmaskmovps 64(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmaskmovps 96(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ sall $2, %r10d // *sizeof(float)
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemv_add_t_4_lib8, .-inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12  <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11  <- A+k*sda*sizeof(float)
+// r12  <- bs*sda*sizeof(float) = 32*sda
+// r13  <- x_t+k*sizeof(float)
+// r14  <- z_n+k*sizeof(float)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
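+//
+// note: this routine fuses the two products that share the same sweep over A:
+// it accumulates A^T * x_t into ymm0..ymm3 (one 8-wide partial sum per
+// column) and, in the same pass, updates z_n += A * x_n using the pre-scaled
+// coefficients held in ymm6..ymm9, storing z_n back 8 elements at a time.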
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemv_add_nt_4_lib8, @function
+inner_kernel_gemv_add_nt_4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_nt_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemv_add_nt_4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_nt_4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r13), %ymm12
+ vmovups 0(%r14), %ymm13
+
+ vmovaps 0(%r11), %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmulps %ymm14, %ymm6, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ subl $8, %r10d
+
+ vmovaps 32(%r11), %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmulps %ymm14, %ymm7, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmovaps 64(%r11), %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmulps %ymm14, %ymm8, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmovaps 96(%r11), %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmulps %ymm14, %ymm9, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmovups %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ cmpl $7, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2ss %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x0, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm11
+
+ vmaskmovps 0(%r13), %ymm11, %ymm12
+ vmaskmovps 0(%r14), %ymm11, %ymm13
+
+// vmovups %ymm14, -32(%rsp) // spill mask to stack
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps 0(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmulps %ymm14, %ymm6, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps 32(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmulps %ymm14, %ymm7, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps 64(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmulps %ymm14, %ymm8, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps 96(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmulps %ymm14, %ymm9, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+//	vmovups	-32(%rsp), %ymm14 // load mask from stack
+ vmaskmovps %ymm13, %ymm11, 0(%r14)
+
+ sall $2, %r10d // *sizeof(float)
+ addq %r10, %r11
+ addq %r10, %r13
+ addq %r10, %r14
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemv_add_nt_4_lib8, .-inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12  <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <-
+// r11 <-
+// r12 <-
+// r13 <-
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
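+//
+// note: the _gen kernel calls this before the main loop when A does not start
+// at the top of an 8-row panel: A and x are moved back by offA elements, one
+// masked block is processed with a mask selecting the lanes
+// offA <= i < offA+kmax, and on exit r10d holds the rows still left for the
+// main kernel.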
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_GEMV_ADD_T_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemv_add_t_4_lib8, @function
+inner_edge_gemv_add_t_4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemv_add_t_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemv_add_t_4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemv_add_t_4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jle 0f // return
+
+ movl %r14d, %r15d
+ sall $2, %r15d // offA*sizeof(float)
+
+ subq %r15, %r11 // A - offA
+ subq %r15, %r13 // x - offA
+
+ movl %r10d, %r15d // kmax
+ addl %r14d, %r15d // kmax + offA
+
+ vcvtsi2ss %r14d, %xmm14, %xmm14 // offA
+ vcvtsi2ss %r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm13, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+ vandps %ymm15, %ymm14, %ymm14
+
+ vmaskmovps 0(%r13), %ymm14, %ymm12
+
+ vmovaps 0(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ vmovaps 32(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ addq $32, %r13 // x + 4
+ addq %r12, %r11 // A + bs*sda
+
+ addl %r14d, %r10d
+ subl $8, %r10d // kmax - (8-offA)
+
+0: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemv_add_t_4_lib8, .-inner_edge_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+
+
+
+#if 0
+// TODO
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_lt_inv_8_lib8, @function
+inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_lib8:
+#endif
+#endif
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x01, %ymm14, %ymm12, %ymm12
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x07, %ymm14, %ymm12, %ymm12
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ vmovaps 144(%r10), %xmm12
+ vblendps $0x01, %xmm14, %xmm12, %xmm12
+ vmovaps 176(%r10), %xmm13
+ vblendps $0x03, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+ vmovaps 208(%r10), %xmm12
+ vblendps $0x07, %xmm14, %xmm12, %xmm12
+ vmovaps 240(%r10), %xmm13
+ vblendps $0x0f, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vshufps $0xff, %xmm1, %xmm1, %xmm2
+ vbroadcastss 28(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm1, %xmm1
+ vmulps %xmm10, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm13, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0xaa, %xmm1, %xmm1, %xmm2
+ vbroadcastss 24(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm1, %xmm1
+ vmulps %xmm9, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm12, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0x55, %xmm1, %xmm1, %xmm2
+ vbroadcastss 20(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm1, %xmm1
+ vmulps %xmm8, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm11, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0x00, %xmm1, %xmm1, %xmm2
+ vbroadcastss 16(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm1, %xmm1
+ vmulps %xmm7, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0xff, %xmm0, %xmm0, %xmm2
+ vbroadcastss 12(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm0, %xmm0
+ vmulps %xmm6, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0xaa, %xmm0, %xmm0, %xmm2
+ vbroadcastss 8(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm0, %xmm0
+ vmulps %xmm5, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0x55, %xmm0, %xmm0, %xmm2
+ vbroadcastss 4(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm0, %xmm0
+ vmulps %xmm4, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0x00, %xmm0, %xmm0, %xmm2
+ vbroadcastss 0(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm0, %xmm0
+
+ vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_lt_inv_8_lib8, .-inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// r13 <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// r13 <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_lt_inv_8_vs_lib8, @function
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x01, %ymm14, %ymm12, %ymm12
+ cmpl $2, %r13d
+ jl 1f
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+ cmpl $3, %r13d
+ jl 2f
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x07, %ymm14, %ymm12, %ymm12
+ cmpl $4, %r13d
+ jl 3f
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ cmpl $5, %r13d
+ jl 4f
+ vmovaps 144(%r10), %xmm12
+ vblendps $0x01, %xmm14, %xmm12, %xmm12
+ cmpl $6, %r13d
+ jl 5f
+ vmovaps 176(%r10), %xmm13
+ vblendps $0x03, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+ cmpl $7, %r13d
+ jl 6f
+ vmovaps 208(%r10), %xmm12
+ vblendps $0x07, %xmm14, %xmm12, %xmm12
+ cmpl $8, %r13d
+ jl 7f
+ vmovaps 240(%r10), %xmm13
+ vblendps $0x0f, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+ jmp 0f
+
+
+
+ vmovaps %ymm14, %ymm12
+1:
+ vmovaps %ymm14, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+2:
+ vmovaps %ymm14, %ymm12
+3:
+ vmovaps %ymm14, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ jmp 8f
+
+4:
+ vmovaps %xmm14, %xmm12
+5:
+ vmovaps %xmm14, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+6:
+ vmovaps %xmm14, %xmm12
+7:
+ vmovaps %xmm14, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+8:
+
+ vmovaps %xmm14, %xmm11
+ vmovaps %xmm14, %xmm12
+ vmovaps %xmm14, %xmm13
+
+0:
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ cmpl $8, %r12d
+ jl 0f
+
+ vshufps $0xff, %xmm1, %xmm1, %xmm2
+ vbroadcastss 28(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm1, %xmm1
+ vmulps %xmm10, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm13, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $7, %r12d
+ jl 0f
+
+ vshufps $0xaa, %xmm1, %xmm1, %xmm2
+ vbroadcastss 24(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm1, %xmm1
+ vmulps %xmm9, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm12, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $6, %r12d
+ jl 0f
+
+ vshufps $0x55, %xmm1, %xmm1, %xmm2
+ vbroadcastss 20(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm1, %xmm1
+ vmulps %xmm8, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm11, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $5, %r12d
+ jl 0f
+
+ vshufps $0x00, %xmm1, %xmm1, %xmm2
+ vbroadcastss 16(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm1, %xmm1
+ vmulps %xmm7, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $4, %r12d
+ jl 0f
+
+ vshufps $0xff, %xmm0, %xmm0, %xmm2
+ vbroadcastss 12(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm0, %xmm0
+ vmulps %xmm6, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $3, %r12d
+ jl 0f
+
+ vshufps $0xaa, %xmm0, %xmm0, %xmm2
+ vbroadcastss 8(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm0, %xmm0
+ vmulps %xmm5, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $2, %r12d
+ jl 0f
+
+ vshufps $0x55, %xmm0, %xmm0, %xmm2
+ vbroadcastss 4(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm0, %xmm0
+ vmulps %xmm4, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $1, %r12d
+ jl 0f
+
+ vshufps $0x00, %xmm0, %xmm0, %xmm2
+ vbroadcastss 0(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm0, %xmm0
+
+0:
+
+ vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_lt_inv_8_vs_lib8, .-inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10 <- kmax
+// r11 <- A
+// r12  <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// r15 <- offA
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10  <- kmax-(8-offA)
+// r11  <- A+k*sda*sizeof(float)
+// r12  <- bs*sda*sizeof(float) = 32*sda
+// r13  <- x_t+k*sizeof(float)
+// r14  <- z_n+k*sizeof(float)
+// r15 <- offA
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
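+//
+// note: this edge routine handles the panel containing the diagonal of the
+// 4-column block of a symmetric matrix stored in its lower triangle: each
+// strictly-lower element feeds both z_n (A*x_n direction) and the ymm0..ymm3
+// accumulators (A^T*x_t direction), the diagonal feeds only the latter, and
+// lanes above the diagonal are blended to zero so the unreferenced upper part
+// of the panel is never used.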
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_symv_add_nt_4l_lib8, @function
+inner_edge_symv_add_nt_4l_lib8:
+#elif defined(OS_MAC)
+_inner_edge_symv_add_nt_4l_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_symv_add_nt_4l_lib8; .scl 2; .type 32; .endef
+inner_edge_symv_add_nt_4l_lib8:
+#endif
+#endif
+
+ movl $8, %eax
+ cmpl %eax, %r10d
+ jge 0f
+ movl %r10d, %eax
+0:
+ subl %r15d, %eax
+
+ vcvtsi2ss %eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x0, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm11
+
+ vmaskmovps 0(%r13), %ymm11, %ymm12
+ vmaskmovps 0(%r14), %ymm11, %ymm13
+
+ vmaskmovps 0(%r11), %ymm11, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x01, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm6, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmaskmovps 32(%r11), %ymm11, %ymm14
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x01, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x03, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm7, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmaskmovps 64(%r11), %ymm11, %ymm14
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x03, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x07, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm8, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmaskmovps 96(%r11), %ymm11, %ymm14
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x07, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vxorps %ymm15, %ymm15, %ymm15
+ vblendps $0x0f, %ymm15, %ymm14, %ymm14
+ vmulps %ymm14, %ymm9, %ymm15
+ vaddps %ymm13, %ymm15, %ymm13
+
+ vmaskmovps %ymm13, %ymm11, 0(%r14)
+
+ subl %eax, %r10d
+
+ salq $2, %rax // *sizeof(float)
+ addq %rax, %r11
+ subq $32, %r11
+ addq %r12, %r11
+ addq %rax, %r13
+ addq %rax, %r14
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_symv_add_nt_4l_lib8, .-inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+
+
+
+
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_symv_add_nt_4r_lib8, @function
+inner_edge_symv_add_nt_4r_lib8:
+#elif defined(OS_MAC)
+_inner_edge_symv_add_nt_4r_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_symv_add_nt_4r_lib8; .scl 2; .type 32; .endef
+inner_edge_symv_add_nt_4r_lib8:
+#endif
+#endif
+
+ movl $4, %eax
+ cmpl %eax, %r10d
+ jge 0f
+ movl %r10d, %eax
+0:
+ subl %r15d, %eax
+
+ vcvtsi2ss %eax, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm13
+#endif
+ vshufps $0x0, %xmm14, %xmm14, %xmm14
+// vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %xmm14, %xmm13, %xmm11
+
+ vmaskmovps 0(%r13), %xmm11, %xmm12
+ vmaskmovps 0(%r14), %xmm11, %xmm13
+
+ vmaskmovps 0(%r11), %xmm11, %xmm14
+ vmulps %xmm14, %xmm12, %xmm15
+ vaddps %xmm0, %xmm15, %xmm0
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x01, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm6, %xmm15
+ vaddps %xmm13, %xmm15, %xmm13
+
+ vmaskmovps 32(%r11), %xmm11, %xmm14
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x01, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm12, %xmm15
+ vaddps %xmm1, %xmm15, %xmm1
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x03, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm7, %xmm15
+ vaddps %xmm13, %xmm15, %xmm13
+
+ vmaskmovps 64(%r11), %xmm11, %xmm14
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x03, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm12, %xmm15
+ vaddps %xmm2, %xmm15, %xmm2
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x07, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm8, %xmm15
+ vaddps %xmm13, %xmm15, %xmm13
+
+ vmaskmovps 96(%r11), %xmm11, %xmm14
+ vxorps %xmm15, %xmm15, %xmm15
+ vblendps $0x07, %xmm15, %xmm14, %xmm14
+ vmulps %xmm14, %xmm12, %xmm15
+ vaddps %xmm3, %xmm15, %xmm3
+// vxorps %xmm15, %xmm15, %xmm15
+// vblendps $0x0f, %xmm15, %xmm14, %xmm14
+// vmulps %xmm14, %xmm9, %xmm15
+// vaddps %xmm13, %xmm15, %xmm13
+
+ vmaskmovps %xmm13, %xmm11, 0(%r14)
+
+ subl %eax, %r10d
+
+ salq $2, %rax // *sizeof(float)
+ addq %rax, %r11
+ subq $32, %r11
+ addq %r12, %r11
+ addq %rax, %r13
+ addq %rax, %r14
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_symv_add_nt_4r_lib8, .-inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
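+//
+// note: the three vhaddps steps plus the final 128-bit extract/add collapse
+// the four 8-wide accumulators into the single vector [z0 z1 z2 z3], which is
+// then scaled by alpha and combined with beta*y.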
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_4_lib8, @function
+inner_blend_t_scale_ab_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_4_lib8:
+#endif
+#endif
+
+ // reduction
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+
+ vhaddps %ymm2, %ymm0, %ymm0
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vaddps %xmm0, %xmm1, %xmm0
+
+ // alpha
+ vbroadcastss 0(%r10), %xmm15
+ vmulps %xmm0, %xmm15, %xmm0
+
+ // beta
+ vbroadcastss 0(%r11), %xmm15
+ vmovups 0(%r12), %xmm14
+ vmulps %xmm15, %xmm14, %xmm14
+ vaddps %xmm0, %xmm14, %xmm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_4_lib8, .-inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_a1_4_lib8, @function
+inner_blend_t_scale_a1_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_a1_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib8:
+#endif
+#endif
+
+ // reduction
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+
+ vhaddps %ymm2, %ymm0, %ymm0
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vaddps %xmm0, %xmm1, %xmm0
+
+ // alpha
+ vbroadcastss 0(%r10), %xmm15
+ vmulps %xmm0, %xmm15, %xmm0
+
+ // beta
+ vmovups 0(%r11), %xmm14
+ vaddps %xmm0, %xmm14, %xmm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_a1_4_lib8, .-inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_M11_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_m11_4_lib8, @function
+inner_blend_t_scale_m11_4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_m11_4_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_4_lib8:
+#endif
+#endif
+
+ // reduction
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+
+ vhaddps %ymm2, %ymm0, %ymm0
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vaddps %xmm0, %xmm1, %xmm0
+
+ // beta
+ vmovups 0(%r10), %xmm14
+ vsubps %xmm0, %xmm14, %xmm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_m11_4_lib8, .-inner_blend_t_scale_m11_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_lib8, @function
+inner_store_4_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_lib8; .scl 2; .type 32; .endef
+inner_store_4_lib8:
+#endif
+#endif
+
+ vmovups %xmm0, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_lib8, .-inner_store_4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_vs_lib8, @function
+inner_store_4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_4_vs_lib8:
+#endif
+#endif
+
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm14
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm14
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+// vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %xmm15, %xmm14, %xmm15
+
+ vmaskmovps %xmm0, %xmm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_vs_lib8, .-inner_store_4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4_gen_lib8, @function
+inner_store_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+// vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+// vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %xmm12, %xmm14, %xmm14
+ vsubps %xmm15, %xmm12, %xmm15
+ vandps %xmm14, %xmm15, %xmm15
+
+ vmaskmovps %xmm0, %xmm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4_gen_lib8, .-inner_store_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_sgemv_t_4_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
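+//
+// note: computes z[0:4] = alpha * A[0:k,0:4]^T * x[0:k] + beta * y[0:4], with
+// A stored in 8-row panels and a panel-to-panel stride of 8*sda floats.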
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_4_lib8
+ .type kernel_sgemv_t_4_lib8, @function
+kernel_sgemv_t_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_4_lib8
+_kernel_sgemv_t_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_4_lib8
+ .def kernel_sgemv_t_4_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_4_lib8, .-kernel_sgemv_t_4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemv_t_4_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
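+//
+// note: same computation as kernel_sgemv_t_4_lib8, but only the first k1
+// (1..4) elements of z are written, via the masked store routine.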
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_4_vs_lib8
+ .type kernel_sgemv_t_4_vs_lib8, @function
+kernel_sgemv_t_4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_4_vs_lib8
+_kernel_sgemv_t_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_4_vs_lib8
+ .def kernel_sgemv_t_4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+ movq ARG9, %r11 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_4_vs_lib8, .-kernel_sgemv_t_4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemv_t_4_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km);
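+//
+// note: generalized variant: offA is the row offset of the first element of A
+// inside its 8-row panel (handled by the edge routine above) and km limits
+// how many elements of z are written.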
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_4_gen_lib8
+ .type kernel_sgemv_t_4_gen_lib8, @function
+kernel_sgemv_t_4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_4_gen_lib8
+_kernel_sgemv_t_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_4_gen_lib8
+ .def kernel_sgemv_t_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // x
+ movq ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemv_add_t_4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_4_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG9, %r10 // z
+ movq ARG10, %r11 // km
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_4_gen_lib8, .-kernel_sgemv_t_4_gen_lib8
+#endif
+
+
+
+
+
+#if 0
+// TODO
+
+// 1 2 3 4 5 6 7
+// void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_lt_inv_8_lib8
+ .type kernel_strsv_lt_inv_8_lib8, @function
+kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_lt_inv_8_lib8
+_kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_lt_inv_8_lib8
+ .def kernel_strsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $8, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 8*sda*sizeof(float)
+ addq %r12, %r11 // A+8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+8
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_lt_inv_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_lt_inv_8_lib8, .-kernel_strsv_lt_inv_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_lt_inv_8_vs_lib8
+ .type kernel_strsv_lt_inv_8_vs_lib8, @function
+kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_lt_inv_8_vs_lib8
+_kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_lt_inv_8_vs_lib8
+ .def kernel_strsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $8, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 8*sda*sizeof(float)
+ addq %r12, %r11 // A+8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+8
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG8, %r12 // km
+ movq ARG9, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_lt_inv_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_lt_inv_8_vs_lib8, .-kernel_strsv_lt_inv_8_vs_lib8
+#endif
+
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_sgemv_nt_4_lib8(int k, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t);
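+//
+// note: fused kernel, one sweep over A for two products:
+//   z_t[0:4]  = alpha_t * A[0:k,0:4]^T * x_t[0:k] + beta_t * y_t[0:4]
+//   z_n[0:k] += alpha_n * A[0:k,0:4] * x_n[0:4]   (updated in place)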
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_nt_4_lib8
+ .type kernel_sgemv_nt_4_lib8, @function
+kernel_sgemv_nt_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_nt_4_lib8
+_kernel_sgemv_nt_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_nt_4_lib8
+ .def kernel_sgemv_nt_4_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_nt_4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+
+
+	// inner kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+ // inner blend n scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_nt_4_lib8, .-kernel_sgemv_nt_4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemv_nt_4_vs_lib8(int k, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_nt_4_vs_lib8
+ .type kernel_sgemv_nt_4_vs_lib8, @function
+kernel_sgemv_nt_4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_nt_4_vs_lib8
+_kernel_sgemv_nt_4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_nt_4_vs_lib8
+ .def kernel_sgemv_nt_4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_nt_4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+ movq ARG12, %r11 // km
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ cmpl $2, %r11d
+ jl 0f
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ cmpl $3, %r11d
+ jl 0f
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ je 0f
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+0:
+
+	// inner kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+ // inner blend n scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+ movq ARG12, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_nt_4_vs_lib8, .-kernel_sgemv_nt_4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_ssymv_l_4l_lib8(int k, float *alpha, float *A, int sda, float *x, float *z);
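+//
+// note: contribution of one 4-column block of a symmetric matrix (lower
+// triangle stored) to z += alpha*A*x; the kernel accumulates into z, so the
+// caller is expected to have initialized z (e.g. with beta*y) beforehand.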
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssymv_l_4l_lib8
+ .type kernel_ssymv_l_4l_lib8, @function
+kernel_ssymv_l_4l_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssymv_l_4l_lib8
+_kernel_ssymv_l_4l_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssymv_l_4l_lib8
+ .def kernel_ssymv_l_4l_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4l_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG5, %r10 // x_n
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+
+
+	// inner edge ssymv & kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x_t
+ movq ARG6, %r14 // z_n
+ movq $0, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_symv_add_nt_4l_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssymv_l_4l_lib8, .-kernel_ssymv_l_4l_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_ssymv_l_4r_lib8(int k, float *alpha, float *A, int sda, float *x, float *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssymv_l_4r_lib8
+ .type kernel_ssymv_l_4r_lib8, @function
+kernel_ssymv_l_4r_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssymv_l_4r_lib8
+_kernel_ssymv_l_4r_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssymv_l_4r_lib8
+ .def kernel_ssymv_l_4r_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4r_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG5, %r10 // x_n
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+
+
+	// inner edge ssymv & kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x_t
+ movq ARG6, %r14 // z_n
+ movq $0, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_symv_add_nt_4r_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssymv_l_4r_lib8, .-kernel_ssymv_l_4r_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_ssymv_l_4l_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssymv_l_4l_gen_lib8
+ .type kernel_ssymv_l_4l_gen_lib8, @function
+kernel_ssymv_l_4l_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssymv_l_4l_gen_lib8
+_kernel_ssymv_l_4l_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssymv_l_4l_gen_lib8
+ .def kernel_ssymv_l_4l_gen_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4l_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+ movq ARG8, %r11 // km
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ cmpl $2, %r11d
+ jl 0f
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ cmpl $3, %r11d
+ jl 0f
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ je 0f
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+0:
+
+	// inner edge ssymv & kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // x_t
+ movq ARG7, %r14 // z_n
+ movq ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_SYMV_ADD_NT_4L_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_symv_add_nt_4l_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_symv_add_nt_4l_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z_t
+ movq ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssymv_l_4l_gen_lib8, .-kernel_ssymv_l_4l_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_ssymv_l_4r_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssymv_l_4r_gen_lib8
+ .type kernel_ssymv_l_4r_gen_lib8, @function
+kernel_ssymv_l_4r_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssymv_l_4r_gen_lib8
+_kernel_ssymv_l_4r_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssymv_l_4r_gen_lib8
+ .def kernel_ssymv_l_4r_gen_lib8; .scl 2; .type 32; .endef
+kernel_ssymv_l_4r_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+ movq ARG8, %r11 // km
+
+ vbroadcastss 0(%r10), %ymm6
+ vmulps %ymm15, %ymm6, %ymm6
+ cmpl $2, %r11d
+ jl 0f
+ vbroadcastss 4(%r10), %ymm7
+ vmulps %ymm15, %ymm7, %ymm7
+ cmpl $3, %r11d
+ jl 0f
+ vbroadcastss 8(%r10), %ymm8
+ vmulps %ymm15, %ymm8, %ymm8
+ je 0f
+ vbroadcastss 12(%r10), %ymm9
+ vmulps %ymm15, %ymm9, %ymm9
+0:
+
+	// inner edge ssymv & kernel sgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // x_t
+ movq ARG7, %r14 // z_n
+ movq ARG3, %r15 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_SYMV_ADD_NT_4R_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_symv_add_nt_4r_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_symv_add_nt_4r_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_NT_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_nt_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_nt_4_lib8
+#endif
+#endif
+
+
+	// call inner blend t scale a1
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z_t
+ movq ARG8, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssymv_l_4r_gen_lib8, .-kernel_ssymv_l_4r_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .float 0.5
+ .float 1.5
+ .float 2.5
+ .float 3.5
+ .float 4.5
+ .float 5.5
+ .float 6.5
+ .float 7.5
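+	// LC00 holds the lane indices offset by 0.5; the kernels broadcast an integer
+	// count as float and subtract it (vsubps), so the sign bit marks exactly the
+	// lanes with index below that count, giving the per-lane mask used by vmaskmovps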
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
+
diff --git a/kernel/avx/kernel_sgemv_8_lib8.S b/kernel/avx/kernel_sgemv_8_lib8.S
new file mode 100644
index 0000000..aafd8cb
--- /dev/null
+++ b/kernel/avx/kernel_sgemv_8_lib8.S
@@ -0,0 +1,2837 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
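+
+// ABI mapping: on Linux and Mac (System V) the first six integer arguments arrive in
+// rdi, rsi, rdx, rcx, r8, r9 and the remaining ones on the stack; on Windows only
+// rcx, rdx, r8, r9 are register arguments and stack arguments start after the
+// 32-byte shadow space (hence the +40 offset of ARG5); xmm6-xmm15 are callee-saved
+// on Windows, which is why the Windows PROLOGUE/EPILOGUE spill and restore them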
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- x
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- x+k*sizeof(float)
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
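+//
+// the routine computes z[0:8] += A[0:8,0:k] * x[0:k], reading one 32-byte panel
+// column of A per k-iteration; the main loop is unrolled by 4 columns into the
+// partial accumulators ymm0..ymm3, which the following blend/scale routine sums
+// into the final result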
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemv_add_n_8_lib8, @function
+inner_kernel_gemv_add_n_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_n_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemv_add_n_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_n_8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ subl $4, %r10d
+
+ vmovaps 32(%r11), %ymm8
+ vbroadcastss 4(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vbroadcastss 8(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vbroadcastss 12(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ addq $128, %r11
+ addq $16, %r12
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ addq $32, %r11
+ addq $4, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+
+ jg 0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemv_add_n_8_lib8, .-inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+k*sda*sizeof(float)
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x+k*sizeof(float)
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemv_add_t_8_lib8, @function
+inner_kernel_gemv_add_t_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemv_add_t_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemv_add_t_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemv_add_t_8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovups 0(%r13), %ymm12
+
+ vmovaps 0(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ subl $8, %r10d
+
+ vmovaps 32(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmovaps 128(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+
+ vmovaps 160(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+
+ vmovaps 192(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+
+ vmovaps 224(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+ addq %r12, %r11
+ addq $32, %r13
+
+ cmpl $7, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
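+	// build the tail mask from the leftover count in r10d: each lane of LC00 holds
+	// its index plus 0.5, so (index+0.5) - count is negative exactly for the first
+	// count lanes, and vmaskmovps then loads only those elements of x and A
+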
+ vcvtsi2ss %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm14
+
+ vmaskmovps 0(%r13), %ymm14, %ymm12
+
+ vmaskmovps 0(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ vmaskmovps 32(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmaskmovps 64(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmaskmovps 96(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmaskmovps 128(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+
+ vmaskmovps 160(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+
+ vmaskmovps 192(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+
+ vmaskmovps 224(%r11), %ymm14, %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+ sall $2, %r10d
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemv_add_t_8_lib8, .-inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float) = 32*sda
+// r13 <- x
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <-
+// r11 <-
+// r12 <-
+// r13 <-
+// r14d <- offA
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_GEMV_ADD_T_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemv_add_t_8_lib8, @function
+inner_edge_gemv_add_t_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemv_add_t_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemv_add_t_8_lib8; .scl 2; .type 32; .endef
+inner_edge_gemv_add_t_8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jle 0f // return
+
+ movl %r14d, %r15d
+ sall $2, %r15d // offA*sizeof(float)
+
+ subq %r15, %r11 // A - offA
+ subq %r15, %r13 // x - offA
+
+ movl %r10d, %r15d // kmax
+ addl %r14d, %r15d // kmax + offA
+
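+	// build a lane mask selecting offA <= index < offA+kmax: the sign bit of
+	// offA-(index+0.5) is set for index >= offA, the sign bit of
+	// (index+0.5)-(offA+kmax) is set for index < offA+kmax, and vandps keeps
+	// only the lanes where both are set
+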
+ vcvtsi2ss %r14d, %xmm14, %xmm14 // offA
+ vcvtsi2ss %r15d, %xmm15, %xmm15 // offA + kmax
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm13, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+ vandps %ymm15, %ymm14, %ymm14
+
+ vmaskmovps 0(%r13), %ymm14, %ymm12
+
+ vmovaps 0(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+
+ vmovaps 32(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+
+ vmovaps 64(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+
+ vmovaps 96(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmovaps 128(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+
+ vmovaps 160(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+
+ vmovaps 192(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+
+ vmovaps 224(%r11), %ymm8
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+ addq $32, %r13 // x + 4
+ addq %r12, %r11 // A + bs*sda
+
+ addl %r14d, %r10d
+ subl $8, %r10d // kmax - (8-offA)
+
+0: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemv_add_t_8_lib8, .-inner_edge_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LN_INV_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_ln_inv_8_lib8, @function
+inner_edge_trsv_ln_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_ln_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_ln_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_ln_inv_8_lib8:
+#endif
+#endif
+
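+	// forward substitution, one column of E at a time: scale z[j] by the pre-inverted
+	// diagonal entry inv_diag_E[j], then subtract z[j]*E[j+1:8,j] from the remaining
+	// components; blending each column against the zero register ymm14 blanks its
+	// first j+1 entries so already-solved components are left untouched
+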
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vbroadcastss 0(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+
+ vmovaps 0(%r10), %ymm13
+ vblendps $0x01, %ymm14, %ymm13, %ymm13
+ vpermilps $0x00, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 4(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x02, %ymm1, %ymm0, %ymm0
+
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vpermilps $0x55, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 8(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x04, %ymm1, %ymm0, %ymm0
+
+ vmovaps 64(%r10), %ymm13
+ vblendps $0x07, %ymm14, %ymm13, %ymm13
+ vpermilps $0xaa, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 12(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x08, %ymm1, %ymm0, %ymm0
+
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vpermilps $0xff, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 16(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x10, %ymm1, %ymm0, %ymm0
+
+ vmovaps 128(%r10), %ymm13
+ vblendps $0x1f, %ymm14, %ymm13, %ymm13
+ vpermilps $0x00, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 20(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x20, %ymm1, %ymm0, %ymm0
+
+ vmovaps 160(%r10), %ymm13
+ vblendps $0x3f, %ymm14, %ymm13, %ymm13
+ vpermilps $0x55, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 24(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x40, %ymm1, %ymm0, %ymm0
+
+ vmovaps 192(%r10), %ymm13
+ vblendps $0x7f, %ymm14, %ymm13, %ymm13
+ vpermilps $0xaa, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+ vbroadcastss 28(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_ln_inv_8_lib8, .-inner_edge_trsv_ln_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_ln_inv_8_vs_lib8, @function
+inner_edge_trsv_ln_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_ln_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_ln_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_ln_inv_8_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vbroadcastss 0(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x01, %ymm1, %ymm0, %ymm0
+ vmovaps 0(%r10), %ymm13
+ vblendps $0x01, %ymm14, %ymm13, %ymm13
+ vpermilps $0x00, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $2, %r12d
+ jl 0f // ret
+
+ vbroadcastss 4(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x02, %ymm1, %ymm0, %ymm0
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vpermilps $0x55, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $3, %r12d
+ jl 0f // ret
+
+ vbroadcastss 8(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x04, %ymm1, %ymm0, %ymm0
+ vmovaps 64(%r10), %ymm13
+ vblendps $0x07, %ymm14, %ymm13, %ymm13
+ vpermilps $0xaa, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $4, %r12d
+ jl 0f // ret
+
+ vbroadcastss 12(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x08, %ymm1, %ymm0, %ymm0
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vpermilps $0xff, %ymm0, %ymm12
+ vperm2f128 $0x00, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $5, %r12d
+ jl 0f // ret
+
+ vbroadcastss 16(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x10, %ymm1, %ymm0, %ymm0
+ vmovaps 128(%r10), %ymm13
+ vblendps $0x1f, %ymm14, %ymm13, %ymm13
+ vpermilps $0x00, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $6, %r12d
+ jl 0f // ret
+
+ vbroadcastss 20(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x20, %ymm1, %ymm0, %ymm0
+ vmovaps 160(%r10), %ymm13
+ vblendps $0x3f, %ymm14, %ymm13, %ymm13
+ vpermilps $0x55, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $7, %r12d
+ jl 0f // ret
+
+ vbroadcastss 24(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x40, %ymm1, %ymm0, %ymm0
+ vmovaps 192(%r10), %ymm13
+ vblendps $0x7f, %ymm14, %ymm13, %ymm13
+ vpermilps $0xaa, %ymm0, %ymm12
+ vperm2f128 $0x11, %ymm12, %ymm12, %ymm12
+ vmulps %ymm13, %ymm12, %ymm15
+ vsubps %ymm15, %ymm0, %ymm0
+
+ cmpl $8, %r12d
+ jl 0f // ret
+
+ vbroadcastss 28(%r11), %ymm12
+ vmulps %ymm0, %ymm12, %ymm1
+ vblendps $0x80, %ymm1, %ymm0, %ymm0
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_ln_inv_8_vs_lib8, .-inner_edge_trsv_ln_inv_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_lt_inv_8_lib8, @function
+inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_lib8:
+#endif
+#endif
+
+ vxorps %ymm14, %ymm14, %ymm14
+
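+	// transpose the 8x8 panel with unpcklps/unpckhps/shufps so that the backward
+	// substitution below can access E by rows, i.e. as columns of E^T; the diagonal
+	// and the entries above it are blanked against the zero register first
+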
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x01, %ymm14, %ymm12, %ymm12
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x07, %ymm14, %ymm12, %ymm12
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ vmovaps 144(%r10), %xmm12
+ vblendps $0x01, %xmm14, %xmm12, %xmm12
+ vmovaps 176(%r10), %xmm13
+ vblendps $0x03, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+ vmovaps 208(%r10), %xmm12
+ vblendps $0x07, %xmm14, %xmm12, %xmm12
+ vmovaps 240(%r10), %xmm13
+ vblendps $0x0f, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ vshufps $0xff, %xmm1, %xmm1, %xmm2
+ vbroadcastss 28(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm1, %xmm1
+ vmulps %xmm10, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm13, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0xaa, %xmm1, %xmm1, %xmm2
+ vbroadcastss 24(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm1, %xmm1
+ vmulps %xmm9, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm12, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0x55, %xmm1, %xmm1, %xmm2
+ vbroadcastss 20(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm1, %xmm1
+ vmulps %xmm8, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm11, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+ vshufps $0x00, %xmm1, %xmm1, %xmm2
+ vbroadcastss 16(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm1, %xmm1
+ vmulps %xmm7, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0xff, %xmm0, %xmm0, %xmm2
+ vbroadcastss 12(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm0, %xmm0
+ vmulps %xmm6, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0xaa, %xmm0, %xmm0, %xmm2
+ vbroadcastss 8(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm0, %xmm0
+ vmulps %xmm5, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0x55, %xmm0, %xmm0, %xmm2
+ vbroadcastss 4(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm0, %xmm0
+ vmulps %xmm4, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+ vshufps $0x00, %xmm0, %xmm0, %xmm2
+ vbroadcastss 0(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm0, %xmm0
+
+ vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_lt_inv_8_lib8, .-inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution with vector RHS
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// r13 <- kn
+// r14 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// r13 <- kn
+// r14 <- x
+// ymm0 <- [z0 z1 z2 z3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsv_lt_inv_8_vs_lib8, @function
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsv_lt_inv_8_vs_lib8:
+#endif
+#endif
+
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm13, %ymm14
+
+ vmovups 0(%r14), %ymm15
+ vblendvps %ymm14, %ymm0, %ymm15, %ymm0
+
+
+
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x01, %ymm14, %ymm12, %ymm12
+ cmpl $2, %r13d
+ jl 1f
+ vmovaps 32(%r10), %ymm13
+ vblendps $0x03, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+ cmpl $3, %r13d
+ jl 2f
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x07, %ymm14, %ymm12, %ymm12
+ cmpl $4, %r13d
+ jl 3f
+ vmovaps 96(%r10), %ymm13
+ vblendps $0x0f, %ymm14, %ymm13, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ cmpl $5, %r13d
+ jl 4f
+ vmovaps 144(%r10), %xmm12
+ vblendps $0x01, %xmm14, %xmm12, %xmm12
+ cmpl $6, %r13d
+ jl 5f
+ vmovaps 176(%r10), %xmm13
+ vblendps $0x03, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+ cmpl $7, %r13d
+ jl 6f
+ vmovaps 208(%r10), %xmm12
+ vblendps $0x07, %xmm14, %xmm12, %xmm12
+ cmpl $8, %r13d
+ jl 7f
+ vmovaps 240(%r10), %xmm13
+ vblendps $0x0f, %xmm14, %xmm13, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+ jmp 0f
+
+
+
+ vmovaps %ymm14, %ymm12
+1:
+ vmovaps %ymm14, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm8
+ vunpckhps %ymm13, %ymm12, %ymm9
+
+2:
+ vmovaps %ymm14, %ymm12
+3:
+ vmovaps %ymm14, %ymm13
+ vunpcklps %ymm13, %ymm12, %ymm10
+ vunpckhps %ymm13, %ymm12, %ymm11
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm7
+ vshufps $0xee, %ymm10, %ymm8, %ymm4
+ vshufps $0x44, %ymm11, %ymm9, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vextractf128 $0x1, %ymm7, %xmm7
+ vextractf128 $0x1, %ymm4, %xmm8
+ vextractf128 $0x1, %ymm5, %xmm9
+ vextractf128 $0x1, %ymm6, %xmm10
+
+ jmp 8f
+
+4:
+ vmovaps %xmm14, %xmm12
+5:
+ vmovaps %xmm14, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm1
+ vunpckhps %xmm13, %xmm12, %xmm2
+
+6:
+ vmovaps %xmm14, %xmm12
+7:
+ vmovaps %xmm14, %xmm13
+ vunpcklps %xmm13, %xmm12, %xmm3
+ vunpckhps %xmm13, %xmm12, %xmm15
+
+ vshufps $0xee, %xmm3, %xmm1, %xmm11
+ vshufps $0x44, %xmm15, %xmm2, %xmm12
+ vshufps $0xee, %xmm15, %xmm2, %xmm13
+
+8:
+
+ vmovaps %xmm14, %xmm11
+ vmovaps %xmm14, %xmm12
+ vmovaps %xmm14, %xmm13
+
+0:
+ vxorps %ymm14, %ymm14, %ymm14
+
+ vextractf128 $0x1, %ymm0, %xmm1
+
+ cmpl $8, %r12d
+ jl 0f
+
+ vshufps $0xff, %xmm1, %xmm1, %xmm2
+ cmpl $8, %r13d
+ jl 1f
+ vbroadcastss 28(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm1, %xmm1
+1:
+ vmulps %xmm10, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm13, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $7, %r12d
+ jl 0f
+
+ vshufps $0xaa, %xmm1, %xmm1, %xmm2
+ cmpl $7, %r13d
+ jl 1f
+ vbroadcastss 24(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm1, %xmm1
+1:
+ vmulps %xmm9, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm12, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $6, %r12d
+ jl 0f
+
+ vshufps $0x55, %xmm1, %xmm1, %xmm2
+ cmpl $6, %r13d
+ jl 1f
+ vbroadcastss 20(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm1, %xmm1
+1:
+ vmulps %xmm8, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+ vmulps %xmm11, %xmm2, %xmm15
+ vsubps %xmm15, %xmm1, %xmm1
+
+0:
+ cmpl $5, %r12d
+ jl 0f
+
+ vshufps $0x00, %xmm1, %xmm1, %xmm2
+ cmpl $5, %r13d
+ jl 1f
+ vbroadcastss 16(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm1, %xmm1
+1:
+ vmulps %xmm7, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $4, %r12d
+ jl 0f
+
+ vshufps $0xff, %xmm0, %xmm0, %xmm2
+ cmpl $4, %r13d
+ jl 1f
+ vbroadcastss 12(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x08, %xmm2, %xmm0, %xmm0
+1:
+ vmulps %xmm6, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $3, %r12d
+ jl 0f
+
+ vshufps $0xaa, %xmm0, %xmm0, %xmm2
+ cmpl $3, %r13d
+ jl 1f
+ vbroadcastss 8(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x04, %xmm2, %xmm0, %xmm0
+1:
+ vmulps %xmm5, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $2, %r12d
+ jl 0f
+
+ vshufps $0x55, %xmm0, %xmm0, %xmm2
+ cmpl $2, %r13d
+ jl 1f
+ vbroadcastss 4(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x02, %xmm2, %xmm0, %xmm0
+1:
+ vmulps %xmm4, %xmm2, %xmm15
+ vsubps %xmm15, %xmm0, %xmm0
+
+0:
+ cmpl $1, %r12d
+ jl 0f
+
+ vshufps $0x00, %xmm0, %xmm0, %xmm2
+ cmpl $1, %r13d
+ jl 1f
+ vbroadcastss 0(%r11), %xmm15
+ vmulps %xmm2, %xmm15, %xmm2
+ vblendps $0x01, %xmm2, %xmm0, %xmm0
+1:
+
+0:
+
+ vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsv_lt_inv_8_vs_lib8, .-inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_ab_8_lib8, @function
+inner_blend_n_scale_ab_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_ab_8_lib8; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_8_lib8:
+#endif
+#endif
+
+ // reduction
+ vaddps %ymm0, %ymm1, %ymm0
+ vaddps %ymm2, %ymm3, %ymm2
+ vaddps %ymm0, %ymm2, %ymm0
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+ vmulps %ymm0, %ymm15, %ymm0
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+ vmovups 0(%r12), %ymm14
+ vmulps %ymm15, %ymm14, %ymm14
+ vaddps %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_ab_8_lib8, .-inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z0 z1 z2 z3]_b
+// ymm2 <- [z0 z1 z2 z3]_c
+// ymm3 <- [z0 z1 z2 z3]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_m11_8_lib8, @function
+inner_blend_n_scale_m11_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_m11_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_m11_8_lib8; .scl 2; .type 32; .endef
+inner_blend_n_scale_m11_8_lib8:
+#endif
+#endif
+
+ // reduction
+ vaddps %ymm0, %ymm1, %ymm0
+ vaddps %ymm2, %ymm3, %ymm2
+ vaddps %ymm0, %ymm2, %ymm0
+
+ // beta
+ vmovups 0(%r10), %ymm14
+ vsubps %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_m11_8_lib8, .-inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_8_lib8, @function
+inner_blend_t_scale_ab_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_8_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_8_lib8:
+#endif
+#endif
+
+ // reduction
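+	// ymm0..ymm7 each hold 8 partial sums for one element of z; the vhaddps tree
+	// plus the cross-lane vperm2f128/vaddps below collapse them into a single
+	// [z0 .. z7] result vector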
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+ vhaddps %ymm5, %ymm4, %ymm4
+ vhaddps %ymm7, %ymm6, %ymm6
+
+ vhaddps %ymm2, %ymm0, %ymm0
+ vhaddps %ymm6, %ymm4, %ymm4
+
+ vperm2f128 $0x20, %ymm4, %ymm0, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm4, %ymm0
+
+ vaddps %ymm0, %ymm1, %ymm0
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+ vmulps %ymm0, %ymm15, %ymm0
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+ vmovups 0(%r12), %ymm14
+ vmulps %ymm15, %ymm14, %ymm14
+ vaddps %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_8_lib8, .-inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for alpha=-1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_m11_8_lib8, @function
+inner_blend_t_scale_m11_8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_m11_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_m11_8_lib8; .scl 2; .type 32; .endef
+inner_blend_t_scale_m11_8_lib8:
+#endif
+#endif
+
+ // reduction
+ vhaddps %ymm1, %ymm0, %ymm0
+ vhaddps %ymm3, %ymm2, %ymm2
+ vhaddps %ymm5, %ymm4, %ymm4
+ vhaddps %ymm7, %ymm6, %ymm6
+
+ vhaddps %ymm2, %ymm0, %ymm0
+ vhaddps %ymm6, %ymm4, %ymm4
+
+ vperm2f128 $0x20, %ymm4, %ymm0, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm4, %ymm0
+
+ vaddps %ymm0, %ymm1, %ymm0
+
+ // beta
+ vmovups 0(%r10), %ymm14
+ vsubps %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_m11_8_lib8, .-inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8_lib8, @function
+inner_store_8_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8_lib8; .scl 2; .type 32; .endef
+inner_store_8_lib8:
+#endif
+#endif
+
+ vmovups %ymm0, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8_lib8, .-inner_store_8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8_vs_lib8, @function
+inner_store_8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8_vs_lib8:
+#endif
+#endif
+
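+	// store only the first km elements of z: km is broadcast as float and subtracted
+	// from LC00, so the sign bit is set exactly for lanes with index < km and
+	// vmaskmovps writes just those lanes
+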
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm14
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm14, %ymm15
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8_vs_lib8, .-inner_store_8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store gen
+//
+// input arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- k0 : start from (inc)
+// r12d <- k1 : up to (exc)
+// ymm0 <- [z0 z1 z2 z3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8_gen_lib8, @function
+inner_store_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
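+	// (a lane is written when k0 <= index < k1, combining the two sign-bit masks with vandps)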
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8_gen_lib8, .-inner_store_8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_sgemv_n_8_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z);
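+//
+// computes z[0:8] = alpha*A*x + beta*y, with A an 8 x k block stored panel-major
+// (panel height bs = 8, one 32-byte column per k index)
+//
+// minimal calling sketch (illustrative names, assuming the prototype above):
+//
+//   float alpha = 1.0f, beta = 0.0f;
+//   kernel_sgemv_n_8_lib8(k, &alpha, A, x, &beta, y, z);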
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_n_8_lib8
+ .type kernel_sgemv_n_8_lib8, @function
+kernel_sgemv_n_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_n_8_lib8
+_kernel_sgemv_n_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_n_8_lib8
+ .def kernel_sgemv_n_8_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_n_8_lib8, .-kernel_sgemv_n_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_sgemv_n_8_vs_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_n_8_vs_lib8
+ .type kernel_sgemv_n_8_vs_lib8, @function
+kernel_sgemv_n_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_n_8_vs_lib8
+_kernel_sgemv_n_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_n_8_vs_lib8
+ .def kernel_sgemv_n_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG8, %r11 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_n_8_vs_lib8, .-kernel_sgemv_n_8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemv_n_8_gen_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_n_8_gen_lib8
+ .type kernel_sgemv_n_8_gen_lib8, @function
+kernel_sgemv_n_8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_n_8_gen_lib8
+_kernel_sgemv_n_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_n_8_gen_lib8
+ .def kernel_sgemv_n_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_n_8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+	movq	ARG8, %r11 // k0
+	movq	ARG9, %r12 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_n_8_gen_lib8, .-kernel_sgemv_n_8_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_sgemv_t_8_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z);
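+//
+// computes z[0:8] = alpha*A'*x + beta*y, with A a k x 8 block stored panel-major
+// (panel height bs = 8); consecutive 8-row panels of A are 8*sda*sizeof(float) bytes apart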
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_8_lib8
+ .type kernel_sgemv_t_8_lib8, @function
+kernel_sgemv_t_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_8_lib8
+_kernel_sgemv_t_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_8_lib8
+ .def kernel_sgemv_t_8_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_8_lib8, .-kernel_sgemv_t_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemv_t_8_vs_lib8(int k, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_8_vs_lib8
+ .type kernel_sgemv_t_8_vs_lib8, @function
+kernel_sgemv_t_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_8_vs_lib8
+_kernel_sgemv_t_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_8_vs_lib8
+ .def kernel_sgemv_t_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+ movq ARG9, %r11 // k1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_8_vs_lib8, .-kernel_sgemv_t_8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemv_t_8_gen_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemv_t_8_gen_lib8
+ .type kernel_sgemv_t_8_gen_lib8, @function
+kernel_sgemv_t_8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemv_t_8_gen_lib8
+_kernel_sgemv_t_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemv_t_8_gen_lib8
+ .def kernel_sgemv_t_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemv_t_8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv edge & kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // x
+ movq ARG3, %r14 // offA
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemv_add_t_8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+	INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // y
+
+#if MACRO_LEVEL>=1
+	INNER_BLEND_T_SCALE_AB_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG9, %r10 // z
+ movq ARG10, %r11 // km
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemv_t_8_gen_lib8, .-kernel_sgemv_t_8_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_strsv_ln_inv_8_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z);
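+//
+// computes w = y[0:8] - A[0:8,0:k]*x[0:k], solves the lower-triangular system
+// L*z = w, and stores z; L is the 8x8 lower-triangular block starting at column k
+// of A, and inv_diag_A holds the reciprocals of its diagonal entries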
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_ln_inv_8_lib8
+ .type kernel_strsv_ln_inv_8_lib8, @function
+kernel_strsv_ln_inv_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_ln_inv_8_lib8
+_kernel_strsv_ln_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_ln_inv_8_lib8
+ .def kernel_strsv_ln_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_ln_inv_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+	movq	%r11, %r13 // A+k*bs*sizeof(float)
+
+
+ // call inner blender n
+
+ movq ARG5, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+	movq	%r13, %r10 // A+k*bs*sizeof(float)
+ movq ARG3, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LN_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_ln_inv_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_ln_inv_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_ln_inv_8_lib8, .-kernel_strsv_ln_inv_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_strsv_ln_inv_8_vs_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_ln_inv_8_vs_lib8
+ .type kernel_strsv_ln_inv_8_vs_lib8, @function
+kernel_strsv_ln_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_ln_inv_8_vs_lib8
+_kernel_strsv_ln_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_ln_inv_8_vs_lib8
+ .def kernel_strsv_ln_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_ln_inv_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG4, %r12 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_N_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_n_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_n_8_lib8
+#endif
+#endif
+
+	movq	%r11, %r13 // A+k*bs*sizeof(float)
+
+
+ // call inner blender n
+
+ movq ARG5, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+	movq	%r13, %r10 // A+k*bs*sizeof(float)
+ movq ARG3, %r11 // inv_diag_A
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_ln_inv_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_ln_inv_8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // z
+ movq ARG7, %r11 // km
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_ln_inv_8_vs_lib8, .-kernel_strsv_ln_inv_8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_lt_inv_8_lib8
+ .type kernel_strsv_lt_inv_8_lib8, @function
+kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_lt_inv_8_lib8
+_kernel_strsv_lt_inv_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_lt_inv_8_lib8
+ .def kernel_strsv_lt_inv_8_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $8, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 8*sda*sizeof(float)
+ addq %r12, %r11 // A+8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+8
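+	// the 8x8 diagonal block is skipped here: the transposed gemv below runs over the remaining k-8 panel rows of A and the matching tail of x, and the diagonal block itself is solved later by the trsv edge routine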
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LT_INV_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_lt_inv_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_lt_inv_8_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_lt_inv_8_lib8, .-kernel_strsv_lt_inv_8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsv_lt_inv_8_vs_lib8
+ .type kernel_strsv_lt_inv_8_vs_lib8, @function
+kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsv_lt_inv_8_vs_lib8
+_kernel_strsv_lt_inv_8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsv_lt_inv_8_vs_lib8
+ .def kernel_strsv_lt_inv_8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsv_lt_inv_8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemv kernel t
+
+ movq ARG1, %r10 // k
+ subl $8, %r10d
+ movq ARG2, %r11 // A
+ movq ARG3, %r12
+ sall $5, %r12d // 8*sda*sizeof(float)
+ addq %r12, %r11 // A+8*sda*sizeof(float)
+ movq ARG5, %r13 // x
+ addq $32, %r13 // x+8
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMV_ADD_T_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemv_add_t_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemv_add_t_8_lib8
+#endif
+#endif
+
+
+ // call inner blender t
+
+ movq ARG6, %r10 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_M11_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_m11_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_m11_8_lib8
+#endif
+#endif
+
+
+ // solution
+
+ movq ARG2, %r10 // A
+ movq ARG4, %r11 // inv_diag_A
+ movq ARG8, %r12 // km
+ movq ARG9, %r13 // kn
+ movq ARG5, %r14 // x
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsv_lt_inv_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsv_lt_inv_8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // z
+ movq ARG9, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsv_lt_inv_8_vs_lib8, .-kernel_strsv_lt_inv_8_vs_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .float 0.5
+ .float 1.5
+ .float 2.5
+ .float 3.5
+ .float 4.5
+ .float 5.5
+ .float 6.5
+ .float 7.5
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgesc_lib8.S b/kernel/avx/kernel_sgesc_lib8.S
new file mode 100644
index 0000000..43ff708
--- /dev/null
+++ b/kernel/avx/kernel_sgesc_lib8.S
@@ -0,0 +1,506 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
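+// note: the Windows x64 calling convention also treats rdi, rsi and xmm6-xmm15 as callee-saved, hence the larger frame saved here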
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGESC_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgesc_8_lib8, @function
+inner_kernel_sgesc_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgesc_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgesc_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgesc_8_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm15
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
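+	// scale A in place by alpha: 4 panel columns (4*8 floats) per main-loop iteration, one column per clean-up iteration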
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r12)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, 32(%r12)
+
+ vmovaps 64(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, 64(%r12)
+ addq $128, %r12
+
+ vmovaps -32(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, -32(%r12)
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmulps %ymm15, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r12)
+ subl $1, %r10d
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgesc_8_lib8, .-inner_kernel_sgesc_8_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- alpha
+// r12 <- A
+// r13d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGESC_8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgesc_8_gen_lib8, @function
+inner_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgesc_8_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgesc_8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
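+	// ymm15 = {0.5,...,7.5} - m1: the sign bit is set exactly in the first m1 lanes, so the masked stores below only write rows 0..m1-1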
+
+ vbroadcastss 0(%r11), %ymm14
+
+ cmpl $3, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r12)
+ subl $4, %r10d
+
+ vmovaps 32(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 32(%r12)
+
+ vmovaps 64(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 64(%r12)
+ addq $128, %r12
+
+ vmovaps -32(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, -32(%r12)
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean-up loop
+
+ vmovaps 0(%r12), %ymm0
+ vmulps %ymm14, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm15, 0(%r12)
+ subl $1, %r10d
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean-up loop
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_kernel_sgesc_8_gen_lib8, .-inner_kernel_sgesc_8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx
+// void kernel_sgesc_8_lib8(int k, float *alpha, float *A);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgesc_8_lib8
+ .type kernel_sgesc_8_lib8, @function
+kernel_sgesc_8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgesc_8_lib8
+_kernel_sgesc_8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgesc_8_lib8
+ .def kernel_sgesc_8_lib8; .scl 2; .type 32; .endef
+kernel_sgesc_8_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgesc kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGESC_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgesc_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgesc_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgesc_8_lib8, .-kernel_sgesc_8_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgesc_8_gen_lib8(int k, float *alpha, float *A, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgesc_8_gen_lib8
+ .type kernel_sgesc_8_gen_lib8, @function
+kernel_sgesc_8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgesc_8_gen_lib8
+_kernel_sgesc_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgesc_8_gen_lib8
+ .def kernel_sgesc_8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgesc_8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgesc kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // alpha
+ movq ARG3, %r12 // A
+	movq	ARG4, %r13 // m1
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGESC_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgesc_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgesc_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgesc_8_gen_lib8, .-kernel_sgesc_8_gen_lib8
+#endif
+
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx/kernel_sgetr_lib8.S b/kernel/avx/kernel_sgetr_lib8.S
new file mode 100644
index 0000000..745c42e
--- /dev/null
+++ b/kernel/avx/kernel_sgetr_lib8.S
@@ -0,0 +1,2476 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGETR_8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgetr_8_lib8, @function
+inner_kernel_sgetr_8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgetr_8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgetr_8_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgetr_8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $7, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ subl $8, %r10d
+ addq %r12, %r11
+
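+	// shufps merges the interleaved pairs and vperm2f128 selects the low/high 128-bit lanes, completing the 8x8 in-register transpose before the tile is stored to B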
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmovaps %ymm2, 0(%r13)
+ vmovaps %ymm3, 128(%r13)
+ vshufps $0xee, %ymm10, %ymm8, %ymm0
+ vshufps $0xee, %ymm14, %ymm12, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmovaps %ymm2, 32(%r13)
+ vmovaps %ymm3, 160(%r13)
+ vshufps $0x44, %ymm11, %ymm9, %ymm0
+ vshufps $0x44, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmovaps %ymm2, 64(%r13)
+ vmovaps %ymm3, 192(%r13)
+ vshufps $0xee, %ymm11, %ymm9, %ymm0
+ vshufps $0xee, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmovaps %ymm2, 96(%r13)
+ vmovaps %ymm3, 224(%r13)
+
+ addq $256, %r13
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
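+	// clean-up: transpose one more full 8x8 tile, but store only the columns of B that correspond to the k (<8) remaining rows of A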
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ // 0
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm8
+ vmovaps %ymm8, 0(%r13)
+ cmpl $1, %r10d
+ jle 3f
+ // 1
+ vperm2f128 $0x20, %ymm3, %ymm2, %ymm8
+ vmovaps %ymm8, 32(%r13)
+ cmpl $2, %r10d
+ jle 3f
+ // 2
+ vperm2f128 $0x20, %ymm5, %ymm4, %ymm8
+ vmovaps %ymm8, 64(%r13)
+ cmpl $3, %r10d
+ jle 3f
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmovaps %ymm8, 96(%r13)
+ cmpl $4, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmovaps %ymm8, 128(%r13)
+ cmpl $5, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmovaps %ymm8, 160(%r13)
+ cmpl $6, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmovaps %ymm8, 192(%r13)
+// cmpl $7, %r10d
+// jle 3f
+ // 7
+// vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+// vmovaps %ymm8, 224(%r13)
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d // kleft*sizeof(float)
+ addq %r14, %r11 // A+kleft
+ movl %r10d, %r14d
+ sall $5, %r14d // kleft*bs*sizeof(float)
+ addq %r14, %r13
+ movl $0, %r10d
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgetr_8_lib8, .-inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_sgetr_8_gen_lib8, @function
+inner_kernel_sgetr_8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_sgetr_8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_sgetr_8_gen_lib8; .scl 2; .type 32; .endef
+inner_kernel_sgetr_8_gen_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $7, %r10d
+ jle 0f // consider clean-up
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ subl $8, %r10d
+ addq %r12, %r11
+
+ vmovupd -32(%rsp), %ymm4
+
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmaskmovps %ymm2, %ymm4, 0(%r13)
+ vmaskmovps %ymm3, %ymm4, 128(%r13)
+ vshufps $0xee, %ymm10, %ymm8, %ymm0
+ vshufps $0xee, %ymm14, %ymm12, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmaskmovps %ymm2, %ymm4, 32(%r13)
+ vmaskmovps %ymm3, %ymm4, 160(%r13)
+ vshufps $0x44, %ymm11, %ymm9, %ymm0
+ vshufps $0x44, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmaskmovps %ymm2, %ymm4, 64(%r13)
+ vmaskmovps %ymm3, %ymm4, 192(%r13)
+ vshufps $0xee, %ymm11, %ymm9, %ymm0
+ vshufps $0xee, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm2
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm3
+ vmaskmovps %ymm2, %ymm4, 96(%r13)
+ vmaskmovps %ymm3, %ymm4, 224(%r13)
+
+ addq $256, %r13
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+0: // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ vperm2f128 $0x20, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ cmpl $1, %r10d
+ jle 3f
+ // 1
+ vperm2f128 $0x20, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ cmpl $2, %r10d
+ jle 3f
+ // 2
+ vperm2f128 $0x20, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ cmpl $3, %r10d
+ jle 3f
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ cmpl $4, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 128(%r13)
+ cmpl $5, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 160(%r13)
+ cmpl $6, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 192(%r13)
+// cmpl $7, %r10d
+// jle 3f
+ // 7
+// vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+// vmaskmovps %ymm8, %ymm9, 224(%r13)
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d // kleft*sizeof(float)
+ addq %r14, %r11 // A+kleft
+ movl %r10d, %r14d
+ sall $5, %r14d // kleft*bs*sizeof(float)
+ addq %r14, %r13
+ movl $0, %r10d
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_sgetr_8_gen_lib8, .-inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_0_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_0_gen_lib8, @function
+inner_edge_sgetr_8_0_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_0_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_0_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
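+	// the mask is spilled to the stack because the transpose code in the sgetr kernels uses all of ymm8-ymm15; it is reloaded from -32(%rsp) right before each masked store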
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_0_gen_lib8, .-inner_edge_sgetr_8_0_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_1_gen_lib8, @function
+inner_edge_sgetr_8_1_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_1_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_1_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ vperm2f128 $0x20, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 2
+ vperm2f128 $0x20, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 128(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 160(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 192(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+ addq $224, %r13 // B+7*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_1_gen_lib8, .-inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_2_gen_lib8, @function
+inner_edge_sgetr_8_2_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_2_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_2_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ vperm2f128 $0x20, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 128(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 160(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+ addq $192, %r13 // B+6*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_2_gen_lib8, .-inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_3_gen_lib8, @function
+inner_edge_sgetr_8_3_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_3_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_3_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ vperm2f128 $0x20, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 128(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$160, %r13 // B+5*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_3_gen_lib8, .-inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_4_gen_lib8, @function
+inner_edge_sgetr_8_4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_4_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ // 4
+ vperm2f128 $0x31, %ymm1, %ymm0, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 96(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$128, %r13 // B+4*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_4_gen_lib8, .-inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_5_gen_lib8, @function
+inner_edge_sgetr_8_5_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_5_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_5_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ // 4
+ // 5
+ vperm2f128 $0x31, %ymm3, %ymm2, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 64(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$96, %r13 // B+3*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_5_gen_lib8, .-inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_6_gen_lib8, @function
+inner_edge_sgetr_8_6_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_6_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_6_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ // 4
+ // 5
+ // 6
+ vperm2f128 $0x31, %ymm5, %ymm4, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jle 3f
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 32(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$64, %r13 // B+2*bs*sizeof(float)
+
+ jmp 2f
+
+3:
+ movl %r10d, %r14d
+ sall $2, %r14d
+ addq %r14, %r11 // A+k*sizeof(float)
+ movl %r10d, %r14d
+ sall $5, %r14d
+ addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_6_gen_lib8, .-inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// subroutine
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14d <- m1
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_sgetr_8_7_gen_lib8, @function
+inner_edge_sgetr_8_7_gen_lib8:
+#elif defined(OS_MAC)
+_inner_edge_sgetr_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_sgetr_8_7_gen_lib8; .scl 2; .type 32; .endef
+inner_edge_sgetr_8_7_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+ vmovupd %ymm15, -32(%rsp) // spill mask to stack
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // common
+ vmovaps 0(%r11), %ymm0
+ vmovaps 32(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm8
+ vunpckhps %ymm1, %ymm0, %ymm9
+ vmovaps 64(%r11), %ymm0
+ vmovaps 96(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm10
+ vunpckhps %ymm1, %ymm0, %ymm11
+ vmovaps 128(%r11), %ymm0
+ vmovaps 160(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm12
+ vunpckhps %ymm1, %ymm0, %ymm13
+ vmovaps 192(%r11), %ymm0
+ vmovaps 224(%r11), %ymm1
+ vunpcklps %ymm1, %ymm0, %ymm14
+ vunpckhps %ymm1, %ymm0, %ymm15
+ vshufps $0x44, %ymm10, %ymm8, %ymm0
+ vshufps $0x44, %ymm14, %ymm12, %ymm1
+ vshufps $0xee, %ymm10, %ymm8, %ymm2
+ vshufps $0xee, %ymm14, %ymm12, %ymm3
+ vshufps $0x44, %ymm11, %ymm9, %ymm4
+ vshufps $0x44, %ymm15, %ymm13, %ymm5
+ vshufps $0xee, %ymm11, %ymm9, %ymm6
+ vshufps $0xee, %ymm15, %ymm13, %ymm7
+
+ vmovupd -32(%rsp), %ymm9
+
+ // 0
+ // 1
+ // 2
+ // 3
+ // 4
+ // 5
+ // 6
+ // 7
+ vperm2f128 $0x31, %ymm7, %ymm6, %ymm8
+ vmaskmovps %ymm8, %ymm9, 0(%r13)
+ subl $1, %r10d
+
+ addq %r12, %r11 // A+bs*sda*sizeof(float)
+	addq	$32, %r13 // B+bs*sizeof(float)
+
+// jmp 2f
+//
+//3:
+// movl %r10d, %r14d
+// sall $2, %r14d
+// addq %r14, %r11 // A+k*sizeof(float)
+// movl %r10d, %r14d
+// sall $5, %r14d
+// addq %r14, %r13 // B+k*bs*sizeof(float)
+
+2: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_sgetr_8_7_gen_lib8, .-inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_0_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_0_lib8
+ .type kernel_sgetr_8_0_lib8, @function
+kernel_sgetr_8_0_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_0_lib8
+_kernel_sgetr_8_0_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_0_lib8
+ .def kernel_sgetr_8_0_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_0_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+ // offsetA==0: no edge
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_0_lib8, .-kernel_sgetr_8_0_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_0_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_0_gen_lib8
+ .type kernel_sgetr_8_0_gen_lib8, @function
+kernel_sgetr_8_0_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_0_gen_lib8
+_kernel_sgetr_8_0_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_0_gen_lib8
+ .def kernel_sgetr_8_0_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_0_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+ // offsetA==0: edge to compute mask
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_0_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_0_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_0_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_0_gen_lib8, .-kernel_sgetr_8_0_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_1_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_1_lib8
+ .type kernel_sgetr_8_1_lib8, @function
+kernel_sgetr_8_1_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_1_lib8
+_kernel_sgetr_8_1_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_1_lib8
+ .def kernel_sgetr_8_1_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_1_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+ // offsetA==1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_1_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_1_lib8, .-kernel_sgetr_8_1_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_1_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_1_gen_lib8
+ .type kernel_sgetr_8_1_gen_lib8, @function
+kernel_sgetr_8_1_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_1_gen_lib8
+_kernel_sgetr_8_1_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_1_gen_lib8
+ .def kernel_sgetr_8_1_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_1_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+ // offsetA==1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_1_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_1_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_1_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_1_gen_lib8, .-kernel_sgetr_8_1_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_2_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_2_lib8
+ .type kernel_sgetr_8_2_lib8, @function
+kernel_sgetr_8_2_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_2_lib8
+_kernel_sgetr_8_2_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_2_lib8
+ .def kernel_sgetr_8_2_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_2_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+	// offsetA==2
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_2_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_2_lib8, .-kernel_sgetr_8_2_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_2_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_2_gen_lib8
+ .type kernel_sgetr_8_2_gen_lib8, @function
+kernel_sgetr_8_2_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_2_gen_lib8
+_kernel_sgetr_8_2_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_2_gen_lib8
+ .def kernel_sgetr_8_2_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_2_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+	// offsetA==2
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_2_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_2_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_2_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_2_gen_lib8, .-kernel_sgetr_8_2_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_3_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_3_lib8
+ .type kernel_sgetr_8_3_lib8, @function
+kernel_sgetr_8_3_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_3_lib8
+_kernel_sgetr_8_3_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_3_lib8
+ .def kernel_sgetr_8_3_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_3_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+	// offsetA==3
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_3_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_3_lib8, .-kernel_sgetr_8_3_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_3_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_3_gen_lib8
+ .type kernel_sgetr_8_3_gen_lib8, @function
+kernel_sgetr_8_3_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_3_gen_lib8
+_kernel_sgetr_8_3_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_3_gen_lib8
+ .def kernel_sgetr_8_3_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_3_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+	// offsetA==3
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_3_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_3_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_3_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_3_gen_lib8, .-kernel_sgetr_8_3_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_4_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_4_lib8
+ .type kernel_sgetr_8_4_lib8, @function
+kernel_sgetr_8_4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_4_lib8
+_kernel_sgetr_8_4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_4_lib8
+ .def kernel_sgetr_8_4_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_4_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+	// offsetA==4
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_4_lib8, .-kernel_sgetr_8_4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_4_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_4_gen_lib8
+ .type kernel_sgetr_8_4_gen_lib8, @function
+kernel_sgetr_8_4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_4_gen_lib8
+_kernel_sgetr_8_4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_4_gen_lib8
+ .def kernel_sgetr_8_4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+	// offsetA==4
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_4_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_4_gen_lib8, .-kernel_sgetr_8_4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_5_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_5_lib8
+ .type kernel_sgetr_8_5_lib8, @function
+kernel_sgetr_8_5_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_5_lib8
+_kernel_sgetr_8_5_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_5_lib8
+ .def kernel_sgetr_8_5_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_5_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+	// offsetA==5
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_5_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_5_lib8, .-kernel_sgetr_8_5_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_5_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_5_gen_lib8
+ .type kernel_sgetr_8_5_gen_lib8, @function
+kernel_sgetr_8_5_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_5_gen_lib8
+_kernel_sgetr_8_5_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_5_gen_lib8
+ .def kernel_sgetr_8_5_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_5_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+	// offsetA==5
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_5_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_5_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_5_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_5_gen_lib8, .-kernel_sgetr_8_5_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_6_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_6_lib8
+ .type kernel_sgetr_8_6_lib8, @function
+kernel_sgetr_8_6_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_6_lib8
+_kernel_sgetr_8_6_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_6_lib8
+ .def kernel_sgetr_8_6_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_6_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+	// offsetA==6
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_6_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_6_lib8, .-kernel_sgetr_8_6_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_6_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_6_gen_lib8
+ .type kernel_sgetr_8_6_gen_lib8, @function
+kernel_sgetr_8_6_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_6_gen_lib8
+_kernel_sgetr_8_6_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_6_gen_lib8
+ .def kernel_sgetr_8_6_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_6_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+	// offsetA==6
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_6_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_6_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_6_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_6_gen_lib8, .-kernel_sgetr_8_6_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx
+// void kernel_sgetr_8_7_lib8(int k, float *A, int sda, float *B);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_7_lib8
+ .type kernel_sgetr_8_7_lib8, @function
+kernel_sgetr_8_7_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_7_lib8
+_kernel_sgetr_8_7_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_7_lib8
+ .def kernel_sgetr_8_7_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_7_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq $8, %r14 // m1
+
+	// offsetA==7
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_7_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_7_lib8, .-kernel_sgetr_8_7_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_sgetr_8_7_gen_lib8(int k, float *A, int sda, float *B, int m1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgetr_8_7_gen_lib8
+ .type kernel_sgetr_8_7_gen_lib8, @function
+kernel_sgetr_8_7_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgetr_8_7_gen_lib8
+_kernel_sgetr_8_7_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgetr_8_7_gen_lib8
+ .def kernel_sgetr_8_7_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgetr_8_7_gen_lib8:
+#endif
+
+ PROLOGUE
+
+	// call inner sgetr kernel
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // m1
+
+	// offsetA==7
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_SGETR_8_7_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_sgetr_8_7_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_sgetr_8_7_gen_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_KERNEL_SGETR_8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_sgetr_8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_sgetr_8_gen_lib8
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgetr_8_7_gen_lib8, .-kernel_sgetr_8_7_gen_lib8
+#endif
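+
+	// Editorial note (hedged sketch, not the shipped reference code): the
+	// kernel_sgetr_8_<off>[_gen]_lib8 family above transposes a k x 8 block of A,
+	// stored in 8-wide panels with panel stride sda and its first row at row
+	// offset <off> within its panel, into an 8 x k block of B (a single 8-wide
+	// panel). Assuming that layout, the offset-0 full-width case behaves
+	// roughly like:
+	//
+	//   for(jj=0; jj<k; jj++)                 // row of A / column of B
+	//     for(ii=0; ii<8; ii++)               // column of A / row of B
+	//       B[ii + 8*jj] = A[(jj/8)*8*sda + jj%8 + 8*ii];
+	//
+	// The _gen variants take an extra size argument m1 and use masked
+	// loads/stores so that only the in-range part of the block is touched; the
+	// read-only constants below provide the half-offset thresholds used to
+	// build those masks.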
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
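+
+	// Editorial note: the .long values above are IEEE-754 single-precision bit
+	// patterns, element 0 first in memory: .LC00 = { 0.5, 1.5, ..., 7.5 },
+	// .LC01 = { 8.5, ..., 15.5 }, .LC02 = { 16.5, ..., 23.5 }, and .LC03 = six
+	// times 1.0f followed by two times -1.0f. One word can be checked with a
+	// small stand-alone C snippet (a hedged aid, not part of the build):
+	//
+	//   #include <stdio.h>
+	//   #include <string.h>
+	//   int main(void) {
+	//       unsigned int u = 1056964608;   // first word of .LC00
+	//       float f;
+	//       memcpy(&f, &u, sizeof f);      // reinterpret the bits as a float
+	//       printf("%f\n", f);             // prints 0.500000
+	//       return 0;
+	//   }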
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/Makefile b/kernel/avx2/Makefile
new file mode 100644
index 0000000..adb91c4
--- /dev/null
+++ b/kernel/avx2/Makefile
@@ -0,0 +1,48 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_8x4_lib4.o kernel_dgemm_8x8_lib4.o kernel_dgemm_12x4_lib4.o kernel_dgemv_8_lib4.o kernel_dsymv_6_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgebp_lib4.o kernel_dgelqf_4_lib4.o
+OBJS += kernel_sgemm_24x4_lib8.o kernel_sgemm_16x4_lib8.o kernel_sgemm_8x8_lib8.o kernel_sgemm_8x4_lib8.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
diff --git a/kernel/avx2/kernel_dgebp_lib4.S b/kernel/avx2/kernel_dgebp_lib4.S
new file mode 100644
index 0000000..4093b23
--- /dev/null
+++ b/kernel/avx2/kernel_dgebp_lib4.S
@@ -0,0 +1,2741 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
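+
+// Editorial note: the two PROLOGUE/EPILOGUE variants mirror the callee-saved
+// register sets of the two ABIs. The System V AMD64 ABI (OS_LINUX, OS_MAC)
+// only requires rbx, rbp and r12-r15 to be preserved, while the Windows x64
+// ABI (OS_WINDOWS) additionally treats rdi, rsi and xmm6-xmm15 as callee
+// saved, hence the larger STACKSIZE and the extra vmovups spills above.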
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+//                               1      2          3        4          5          6
+// void kernel_dger4_sub_12r_lib4(int k, double *A, int sda, double *B, double *C, int sdc)
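+//
+// Hedged reference sketch (editorial assumption, not the shipped C code): the
+// kernel subtracts a rank-4 update from a 12 x k block of C stored in three
+// stacked 4-wide panels (panel stride sdc), with A 12 x 4 (panel stride sda)
+// and B 4 x k held in a single 4-wide panel:
+//
+//   for(jj=0; jj<k; jj++)
+//     for(ii=0; ii<12; ii++)
+//       for(ll=0; ll<4; ll++)
+//         C[(ii/4)*4*sdc + ii%4 + 4*jj] -=
+//             A[(ii/4)*4*sda + ii%4 + 4*ll] * B[ll + 4*jj];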
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_12r_lib4
+ .type kernel_dger4_sub_12r_lib4, @function
+kernel_dger4_sub_12r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_12r_lib4
+_kernel_dger4_sub_12r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_12r_lib4
+ .def kernel_dger4_sub_12r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_12r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+ sall $5, %r15d // 4*sdc*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ vmovapd 0(%r11, %r12, 1), %ymm4
+ vmovapd 32(%r11, %r12, 1), %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm6
+ vmovapd 96(%r11, %r12, 1), %ymm7
+
+ vmovapd 0(%r11, %r12, 2), %ymm8
+ vmovapd 32(%r11, %r12, 2), %ymm9
+ vmovapd 64(%r11, %r12, 2), %ymm10
+ vmovapd 96(%r11, %r12, 2), %ymm11
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r14), %ymm12
+ vmovapd 0(%r14, %r15, 1), %ymm13
+ vmovapd 0(%r14, %r15, 2), %ymm14
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 8(%r13), %ymm15
+ subl $4, %r10d
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 0(%r14)
+ vmovapd %ymm13, 0(%r14, %r15, 1)
+ vmovapd %ymm14, 0(%r14, %r15, 2)
+
+ vmovapd 32(%r14), %ymm12
+ vmovapd 32(%r14, %r15, 1), %ymm13
+ vmovapd 32(%r14, %r15, 2), %ymm14
+ vbroadcastsd 32(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 40(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 48(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 56(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 32(%r14)
+ vmovapd %ymm13, 32(%r14, %r15, 1)
+ vmovapd %ymm14, 32(%r14, %r15, 2)
+
+ vmovapd 64(%r14), %ymm12
+ vmovapd 64(%r14, %r15, 1), %ymm13
+ vmovapd 64(%r14, %r15, 2), %ymm14
+ vbroadcastsd 64(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 72(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 80(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 88(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 64(%r14)
+ vmovapd %ymm13, 64(%r14, %r15, 1)
+ vmovapd %ymm14, 64(%r14, %r15, 2)
+
+ vmovapd 96(%r14), %ymm12
+ vmovapd 96(%r14, %r15, 1), %ymm13
+ vmovapd 96(%r14, %r15, 2), %ymm14
+ vbroadcastsd 96(%r13), %ymm15
+ addq $128, %r13
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd -24(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd -16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd -8(%r13), %ymm15
+ addq $128, %r14
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, -32(%r14)
+ vmovapd %ymm13, -32(%r14, %r15, 1)
+ vmovapd %ymm14, -32(%r14, %r15, 2)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm12
+ vmovapd 0(%r14, %r15, 1), %ymm13
+ vmovapd 0(%r14, %r15, 2), %ymm14
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 8(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 0(%r14)
+ vmovapd %ymm13, 0(%r14, %r15, 1)
+ vmovapd %ymm14, 0(%r14, %r15, 2)
+
+ addq $32, %r13
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_12r_lib4, .-kernel_dger4_sub_12r_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dger4_sub_12r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_12r_vs_lib4
+ .type kernel_dger4_sub_12r_vs_lib4, @function
+kernel_dger4_sub_12r_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_12r_vs_lib4
+_kernel_dger4_sub_12r_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_12r_vs_lib4
+ .def kernel_dger4_sub_12r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_12r_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+ sall $5, %r15d // 4*sdc*sizeof(double)
+ movq ARG7, %rax // km
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ vcvtsi2sd %eax, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
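+	// editorial note: the instructions above build the load mask for the third
+	// (possibly incomplete) 4-row panel of A: km is converted to double,
+	// broadcast to all four lanes and subtracted from the per-lane thresholds
+	// in .LC02 (defined in this file's read-only data, outside this hunk);
+	// lanes whose threshold is smaller than km come out negative, and the
+	// vmaskmovpd loads below use the sign bit of each 64-bit lane as the mask,
+	// so out-of-range rows are read as zero.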
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ vmovapd 0(%r11, %r12, 1), %ymm4
+ vmovapd 32(%r11, %r12, 1), %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm6
+ vmovapd 96(%r11, %r12, 1), %ymm7
+
+ vmaskmovpd 0(%r11, %r12, 2), %ymm15, %ymm8
+ vmaskmovpd 32(%r11, %r12, 2), %ymm15, %ymm9
+ vmaskmovpd 64(%r11, %r12, 2), %ymm15, %ymm10
+ vmaskmovpd 96(%r11, %r12, 2), %ymm15, %ymm11
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r14), %ymm12
+ vmovapd 0(%r14, %r15, 1), %ymm13
+ vmovapd 0(%r14, %r15, 2), %ymm14
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 8(%r13), %ymm15
+ subl $4, %r10d
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 0(%r14)
+ vmovapd %ymm13, 0(%r14, %r15, 1)
+ vmovapd %ymm14, 0(%r14, %r15, 2)
+
+ vmovapd 32(%r14), %ymm12
+ vmovapd 32(%r14, %r15, 1), %ymm13
+ vmovapd 32(%r14, %r15, 2), %ymm14
+ vbroadcastsd 32(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 40(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 48(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 56(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 32(%r14)
+ vmovapd %ymm13, 32(%r14, %r15, 1)
+ vmovapd %ymm14, 32(%r14, %r15, 2)
+
+ vmovapd 64(%r14), %ymm12
+ vmovapd 64(%r14, %r15, 1), %ymm13
+ vmovapd 64(%r14, %r15, 2), %ymm14
+ vbroadcastsd 64(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 72(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 80(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 88(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 64(%r14)
+ vmovapd %ymm13, 64(%r14, %r15, 1)
+ vmovapd %ymm14, 64(%r14, %r15, 2)
+
+ vmovapd 96(%r14), %ymm12
+ vmovapd 96(%r14, %r15, 1), %ymm13
+ vmovapd 96(%r14, %r15, 2), %ymm14
+ vbroadcastsd 96(%r13), %ymm15
+ addq $128, %r13
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd -24(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd -16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd -8(%r13), %ymm15
+ addq $128, %r14
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, -32(%r14)
+ vmovapd %ymm13, -32(%r14, %r15, 1)
+ vmovapd %ymm14, -32(%r14, %r15, 2)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm12
+ vmovapd 0(%r14, %r15, 1), %ymm13
+ vmovapd 0(%r14, %r15, 2), %ymm14
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm12
+ vfnmadd231pd %ymm4, %ymm15, %ymm13
+ vfnmadd231pd %ymm8, %ymm15, %ymm14
+ vbroadcastsd 8(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm12
+ vfnmadd231pd %ymm5, %ymm15, %ymm13
+ vfnmadd231pd %ymm9, %ymm15, %ymm14
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm12
+ vfnmadd231pd %ymm6, %ymm15, %ymm13
+ vfnmadd231pd %ymm10, %ymm15, %ymm14
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm12
+ vfnmadd231pd %ymm7, %ymm15, %ymm13
+ vfnmadd231pd %ymm11, %ymm15, %ymm14
+ vmovapd %ymm12, 0(%r14)
+ vmovapd %ymm13, 0(%r14, %r15, 1)
+ vmovapd %ymm14, 0(%r14, %r15, 2)
+
+ addq $32, %r13
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_12r_vs_lib4, .-kernel_dger4_sub_12r_vs_lib4
+#endif
+
+
+
+
+
+//                              1      2          3        4          5          6
+// void kernel_dger4_sub_8r_lib4(int k, double *A, int sda, double *B, double *C, int sdc)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_8r_lib4
+ .type kernel_dger4_sub_8r_lib4, @function
+kernel_dger4_sub_8r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_8r_lib4
+_kernel_dger4_sub_8r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_8r_lib4
+ .def kernel_dger4_sub_8r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+ sall $5, %r15d // 4*sdc*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ vmovapd 0(%r11, %r12, 1), %ymm4
+ vmovapd 32(%r11, %r12, 1), %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm6
+ vmovapd 96(%r11, %r12, 1), %ymm7
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ subl $4, %r10d
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ vmovapd 32(%r14), %ymm8
+ vmovapd 32(%r14, %r15, 1), %ymm9
+ vbroadcastsd 32(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 40(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 48(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 56(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 32(%r14)
+ vmovapd %ymm9, 32(%r14, %r15, 1)
+
+ vmovapd 64(%r14), %ymm8
+ vmovapd 64(%r14, %r15, 1), %ymm9
+ vbroadcastsd 64(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 72(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 80(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 88(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 64(%r14)
+ vmovapd %ymm9, 64(%r14, %r15, 1)
+
+ vmovapd 96(%r14), %ymm8
+ vmovapd 96(%r14, %r15, 1), %ymm9
+ vbroadcastsd 96(%r13), %ymm15
+ addq $128, %r13
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd -24(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd -16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd -8(%r13), %ymm15
+ addq $128, %r14
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, -32(%r14)
+ vmovapd %ymm9, -32(%r14, %r15, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ addq $32, %r13
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_8r_lib4, .-kernel_dger4_sub_8r_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dger4_sub_8r_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, int km)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_8r_vs_lib4
+ .type kernel_dger4_sub_8r_vs_lib4, @function
+kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_8r_vs_lib4
+_kernel_dger4_sub_8r_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_8r_vs_lib4
+ .def kernel_dger4_sub_8r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_8r_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // C
+	movq	ARG6, %r15 // sdc
+ sall $5, %r15d // 4*sdc*sizeof(double)
+ movq ARG7, %rax // km
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ vcvtsi2sd %eax, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC01(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC01(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ vmaskmovpd 0(%r11, %r12, 1), %ymm15, %ymm4
+ vmaskmovpd 32(%r11, %r12, 1), %ymm15, %ymm5
+ vmaskmovpd 64(%r11, %r12, 1), %ymm15, %ymm6
+ vmaskmovpd 96(%r11, %r12, 1), %ymm15, %ymm7
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ subl $4, %r10d
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ vmovapd 32(%r14), %ymm8
+ vmovapd 32(%r14, %r15, 1), %ymm9
+ vbroadcastsd 32(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 40(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 48(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 56(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 32(%r14)
+ vmovapd %ymm9, 32(%r14, %r15, 1)
+
+ vmovapd 64(%r14), %ymm8
+ vmovapd 64(%r14, %r15, 1), %ymm9
+ vbroadcastsd 64(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 72(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 80(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 88(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 64(%r14)
+ vmovapd %ymm9, 64(%r14, %r15, 1)
+
+ vmovapd 96(%r14), %ymm8
+ vmovapd 96(%r14, %r15, 1), %ymm9
+ vbroadcastsd 96(%r13), %ymm15
+ addq $128, %r13
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd -24(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd -16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd -8(%r13), %ymm15
+ addq $128, %r14
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, -32(%r14)
+ vmovapd %ymm9, -32(%r14, %r15, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm8
+ vmovapd 0(%r14, %r15, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm8
+ vfnmadd231pd %ymm4, %ymm15, %ymm9
+ vbroadcastsd 8(%r13), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm8
+ vfnmadd231pd %ymm5, %ymm15, %ymm9
+ vbroadcastsd 16(%r13), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm8
+ vfnmadd231pd %ymm6, %ymm15, %ymm9
+ vbroadcastsd 24(%r13), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm8
+ vfnmadd231pd %ymm7, %ymm15, %ymm9
+ vmovapd %ymm8, 0(%r14)
+ vmovapd %ymm9, 0(%r14, %r15, 1)
+
+ addq $32, %r13
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_8r_vs_lib4, .-kernel_dger4_sub_8r_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dger12_add_4r_lib4(int n, double *A, double *B, int sdb, double *C)
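+//
+// Hedged reference sketch (editorial assumption, not the shipped C code): the
+// kernel adds a rank-12 update to a 4 x n block of C (a single 4-wide panel),
+// with A 4 x 12 in one panel and B 12 x n spread over three 4-wide panels
+// with panel stride sdb:
+//
+//   for(jj=0; jj<n; jj++)
+//     for(ii=0; ii<4; ii++)
+//       for(ll=0; ll<12; ll++)
+//         C[ii + 4*jj] += A[ii + 4*ll] * B[(ll/4)*4*sdb + ll%4 + 4*jj];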
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger12_add_4r_lib4
+ .type kernel_dger12_add_4r_lib4, @function
+kernel_dger12_add_4r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger12_add_4r_lib4
+_kernel_dger12_add_4r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger12_add_4r_lib4
+ .def kernel_dger12_add_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger12_add_4r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // n
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d
+ movq ARG5, %r14 // C
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $11, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+ vmovapd 32(%r14), %ymm1
+ vmovapd 64(%r14), %ymm2
+ vmovapd 96(%r14), %ymm3
+ vmovapd 128(%r14), %ymm4
+ vmovapd 160(%r14), %ymm5
+ vmovapd 192(%r14), %ymm6
+ vmovapd 224(%r14), %ymm7
+ vmovapd 256(%r14), %ymm8
+ vmovapd 288(%r14), %ymm9
+ vmovapd 320(%r14), %ymm10
+ vmovapd 352(%r14), %ymm11
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 8
+ vmovapd 256(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 9
+ vmovapd 288(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 10
+ vmovapd 320(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 11
+ vmovapd 352(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+ vmovapd %ymm1, 32(%r14)
+ vmovapd %ymm2, 64(%r14)
+ vmovapd %ymm3, 96(%r14)
+ vmovapd %ymm4, 128(%r14)
+ vmovapd %ymm5, 160(%r14)
+ vmovapd %ymm6, 192(%r14)
+ vmovapd %ymm7, 224(%r14)
+ vmovapd %ymm8, 256(%r14)
+ vmovapd %ymm9, 288(%r14)
+ vmovapd %ymm10, 320(%r14)
+ vmovapd %ymm11, 352(%r14)
+
+ addq $384, %r12
+ addq $384, %r14
+ subl $12, %r10d
+
+ cmpl $11, %r10d
+ jg 1b // main loop
+
+2:
+ cmpl $3, %r10d
+	jle		2f // single-column cleanup
+
+ // cleanup loop
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+ vmovapd 32(%r14), %ymm1
+ vmovapd 64(%r14), %ymm2
+ vmovapd 96(%r14), %ymm3
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 8
+ vmovapd 256(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 9
+ vmovapd 288(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 10
+ vmovapd 320(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 11
+ vmovapd 352(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+ vmovapd %ymm1, 32(%r14)
+ vmovapd %ymm2, 64(%r14)
+ vmovapd %ymm3, 96(%r14)
+
+ addq $128, %r12
+ addq $128, %r14
+ subl $4, %r10d
+
+ cmpl $3, %r10d
+	jg		1b // cleanup loop
+
+2:
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 8
+ vmovapd 256(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 9
+ vmovapd 288(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 10
+ vmovapd 320(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 11
+ vmovapd 352(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+
+ addq $32, %r12
+ addq $32, %r14
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+	jg		1b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger12_add_4r_lib4, .-kernel_dger12_add_4r_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dger8_add_4r_lib4(int n, double *A, double *B, int sdb, double *C)
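+//
+// Rough C equivalent (a sketch inferred from the code below, assuming the
+// lib4 panel-major layout: column j of a 4-row panel starts 4 doubles
+// (32 bytes) after column j-1, and the second 4-row panel of B starts
+// 32*sdb bytes, i.e. 4*sdb doubles, after the first):
+//
+//	for(j=0; j<n; j++)
+//		for(i=0; i<4; i++)
+//			for(k=0; k<8; k++)
+//				C[i+4*j] += A[i+4*k] * (k<4 ? B[k+4*j] : B[(k-4)+4*j+4*sdb]);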
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger8_add_4r_lib4
+ .type kernel_dger8_add_4r_lib4, @function
+kernel_dger8_add_4r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger8_add_4r_lib4
+_kernel_dger8_add_4r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger8_add_4r_lib4
+ .def kernel_dger8_add_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger8_add_4r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // n
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d
+ movq ARG5, %r14 // C
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $11, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+ vmovapd 32(%r14), %ymm1
+ vmovapd 64(%r14), %ymm2
+ vmovapd 96(%r14), %ymm3
+ vmovapd 128(%r14), %ymm4
+ vmovapd 160(%r14), %ymm5
+ vmovapd 192(%r14), %ymm6
+ vmovapd 224(%r14), %ymm7
+ vmovapd 256(%r14), %ymm8
+ vmovapd 288(%r14), %ymm9
+ vmovapd 320(%r14), %ymm10
+ vmovapd 352(%r14), %ymm11
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+ vmovapd %ymm1, 32(%r14)
+ vmovapd %ymm2, 64(%r14)
+ vmovapd %ymm3, 96(%r14)
+ vmovapd %ymm4, 128(%r14)
+ vmovapd %ymm5, 160(%r14)
+ vmovapd %ymm6, 192(%r14)
+ vmovapd %ymm7, 224(%r14)
+ vmovapd %ymm8, 256(%r14)
+ vmovapd %ymm9, 288(%r14)
+ vmovapd %ymm10, 320(%r14)
+ vmovapd %ymm11, 352(%r14)
+
+ addq $384, %r12
+ addq $384, %r14
+ subl $12, %r10d
+
+ cmpl $11, %r10d
+ jg 1b // main loop
+
+2:
+ cmpl $3, %r10d
+	jle		2f // single-column cleanup
+
+ // cleanup loop
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+ vmovapd 32(%r14), %ymm1
+ vmovapd 64(%r14), %ymm2
+ vmovapd 96(%r14), %ymm3
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+ vmovapd %ymm1, 32(%r14)
+ vmovapd %ymm2, 64(%r14)
+ vmovapd %ymm3, 96(%r14)
+
+ addq $128, %r12
+ addq $128, %r14
+ subl $4, %r10d
+
+ cmpl $3, %r10d
+	jg		1b // cleanup loop
+
+2:
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+
+ addq $32, %r12
+ addq $32, %r14
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+	jg		1b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger8_add_4r_lib4, .-kernel_dger8_add_4r_lib4
+#endif
+
+
+
+
+
+#if 0
+// 1 2 3 4 5
+// void kernel_dger8_sub_4r_lib4(int n, double *A, double *B, int sdb, double *C)
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger8_add_4r_lib4
+ .type kernel_dger8_add_4r_lib4, @function
+kernel_dger8_add_4r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger8_add_4r_lib4
+_kernel_dger8_add_4r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger8_add_4r_lib4
+ .def kernel_dger8_add_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger8_add_4r_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ movq ARG4, %r13
+ sall $5, %r13d
+ movq ARG5, %r14
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+ vmovapd 128(%r11), %ymm4
+ vmovapd 160(%r11), %ymm5
+ vmovapd 192(%r11), %ymm6
+ vmovapd 224(%r11), %ymm7
+
+ cmpl $7, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ // 04
+ vmovapd 0(%r14), %ymm12
+ vbroadcastsd 0(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 8(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 16(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 24(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 0(%r14)
+
+ // 14
+ vmovapd 32(%r14), %ymm12
+ vbroadcastsd 32(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 40(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 48(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 56(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 32(%r14)
+
+ // 24
+ vmovapd 64(%r14), %ymm12
+ vbroadcastsd 64(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 72(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 80(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 88(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 64(%r14)
+
+ // 34
+ vmovapd 96(%r14), %ymm12
+ vbroadcastsd 96(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 104(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 112(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 120(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 96(%r14)
+
+ // 44
+ vmovapd 128(%r14), %ymm12
+ vbroadcastsd 128(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 136(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 144(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 152(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 128(%r14)
+
+ // 54
+ vmovapd 160(%r14), %ymm12
+ vbroadcastsd 160(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 168(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 176(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 184(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 160(%r14)
+
+ // 64
+ vmovapd 192(%r14), %ymm12
+ vbroadcastsd 192(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 200(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 208(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 216(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 192(%r14)
+
+ // 74
+ vmovapd 224(%r14), %ymm12
+ vbroadcastsd 224(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 232(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 240(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 248(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vmovapd %ymm12, 224(%r14)
+
+ // 08
+ vmovapd 0(%r14), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 0(%r14)
+
+ // 18
+ vmovapd 32(%r14), %ymm12
+ vbroadcastsd 32(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 40(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 48(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 56(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 32(%r14)
+
+ // 28
+ vmovapd 64(%r14), %ymm12
+ vbroadcastsd 64(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+	vbroadcastsd	72(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 80(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 88(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 64(%r14)
+
+ // 38
+ vmovapd 96(%r14), %ymm12
+ vbroadcastsd 96(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 104(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 112(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 120(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 96(%r14)
+
+ // 48
+ vmovapd 128(%r14), %ymm12
+ vbroadcastsd 128(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 136(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 144(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 152(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 128(%r14)
+
+ // 58
+ vmovapd 160(%r14), %ymm12
+ vbroadcastsd 160(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 168(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 176(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 184(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 160(%r14)
+
+ // 68
+ vmovapd 192(%r14), %ymm12
+ vbroadcastsd 192(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 200(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 208(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 216(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 192(%r14)
+
+ // 78
+ vmovapd 224(%r14), %ymm12
+ vbroadcastsd 224(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 232(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 240(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 248(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 224(%r14)
+
+ addq $256, %r12
+ addq $256, %r14
+ subl $8, %r10d
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r14), %ymm12
+ vbroadcastsd 0(%r12), %ymm15
+ vfmadd231pd %ymm0, %ymm15, %ymm12
+ vbroadcastsd 8(%r12), %ymm15
+ vfmadd231pd %ymm1, %ymm15, %ymm12
+ vbroadcastsd 16(%r12), %ymm15
+ vfmadd231pd %ymm2, %ymm15, %ymm12
+ vbroadcastsd 24(%r12), %ymm15
+ vfmadd231pd %ymm3, %ymm15, %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm4, %ymm15, %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm5, %ymm15, %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm6, %ymm15, %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm7, %ymm15, %ymm12
+ vmovapd %ymm12, 0(%r14)
+
+ addq $32, %r12
+ addq $32, %r14
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger8_add_4r_lib4, .-kernel_dger8_add_4r_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_dger4_sub_4r_lib4(int n, double *A, double *B, double *C)
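+//
+// Rough C equivalent (a sketch inferred from the code below; A, B and C are
+// 4-row lib4 panels, with column j starting 4 doubles after column j-1):
+//
+//	for(j=0; j<n; j++)
+//		for(i=0; i<4; i++)
+//			for(k=0; k<4; k++)
+//				C[i+4*j] -= A[i+4*k] * B[k+4*j];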
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_4r_lib4
+ .type kernel_dger4_sub_4r_lib4, @function
+kernel_dger4_sub_4r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_4r_lib4
+_kernel_dger4_sub_4r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_4r_lib4
+ .def kernel_dger4_sub_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_lib4:
+#endif
+
+ PROLOGUE
+
+	movq	ARG1, %r10 // n
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // B
+	movq	ARG4, %r13 // C
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+ vmovapd 64(%r11), %ymm2
+ vmovapd 96(%r11), %ymm3
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ subl $4, %r10d
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ vmovapd 32(%r13), %ymm4
+ vbroadcastsd 32(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 40(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 48(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 56(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 32(%r13)
+
+ vmovapd 64(%r13), %ymm4
+ vbroadcastsd 64(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 72(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 80(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 88(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 64(%r13)
+
+ vmovapd 96(%r13), %ymm4
+ vbroadcastsd 96(%r12), %ymm15
+ addq $128, %r12
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd -24(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd -16(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd -8(%r12), %ymm15
+ addq $128, %r13
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ addq $32, %r12
+ addq $32, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_4r_lib4, .-kernel_dger4_sub_4r_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_dger2_sub_4r_lib4(int n, double *A, double *B, double *C)
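+//
+// Same kind of update as kernel_dger4_sub_4r_lib4 above, but rank-2: only the
+// first two columns of A and the first two entries of each column of B are
+// used (a sketch, same assumed lib4 layout):
+//
+//	for(j=0; j<n; j++)
+//		for(i=0; i<4; i++)
+//			C[i+4*j] -= A[i+4*0]*B[0+4*j] + A[i+4*1]*B[1+4*j];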
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger2_sub_4r_lib4
+ .type kernel_dger2_sub_4r_lib4, @function
+kernel_dger2_sub_4r_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger2_sub_4r_lib4
+_kernel_dger2_sub_4r_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger2_sub_4r_lib4
+ .def kernel_dger2_sub_4r_lib4; .scl 2; .type 32; .endef
+kernel_dger2_sub_4r_lib4:
+#endif
+
+ PROLOGUE
+
+	movq	ARG1, %r10 // n
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // B
+	movq	ARG4, %r13 // C
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // load block from A
+ vmovapd 0(%r11), %ymm0
+ vmovapd 32(%r11), %ymm1
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ subl $4, %r10d
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ vmovapd 32(%r13), %ymm4
+ vbroadcastsd 32(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 40(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vmovapd %ymm4, 32(%r13)
+
+ vmovapd 64(%r13), %ymm4
+ vbroadcastsd 64(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 72(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vmovapd %ymm4, 64(%r13)
+
+ vmovapd 96(%r13), %ymm4
+ vbroadcastsd 96(%r12), %ymm15
+ addq $128, %r12
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd -24(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ addq $128, %r13
+ vmovapd %ymm4, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ addq $32, %r12
+ addq $32, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger2_sub_4r_lib4, .-kernel_dger2_sub_4r_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dger4_sub_4r_vs_lib4(int n, double *A, double *B, double *C, int km)
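+//
+// Variable-size variant of kernel_dger4_sub_4r_lib4 above: only the first km
+// rows of A are loaded (the remaining lanes read as zero via vmaskmovpd), so
+// rows km..3 of C are left numerically unchanged by the update (a sketch,
+// same assumed lib4 layout):
+//
+//	for(j=0; j<n; j++)
+//		for(i=0; i<km; i++)
+//			for(k=0; k<4; k++)
+//				C[i+4*j] -= A[i+4*k] * B[k+4*j];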
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dger4_sub_4r_vs_lib4
+ .type kernel_dger4_sub_4r_vs_lib4, @function
+kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dger4_sub_4r_vs_lib4
+_kernel_dger4_sub_4r_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dger4_sub_4r_vs_lib4
+ .def kernel_dger4_sub_4r_vs_lib4; .scl 2; .type 32; .endef
+kernel_dger4_sub_4r_vs_lib4:
+#endif
+
+ PROLOGUE
+
+	movq	ARG1, %r10 // n
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // B
+	movq	ARG4, %r13 // C
+	movq	ARG5, %r14 // km
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC00(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC00(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
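+	// lane i of ymm15 now holds (i + 0.5) - km (from LC00 = {0.5, 1.5, 2.5, 3.5}),
+	// negative exactly for i < km, so the masked loads below fetch only the
+	// first km rows of A and zero the remaining lanes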
+
+ // load block from A
+ vmaskmovpd 0(%r11), %ymm15, %ymm0
+ vmaskmovpd 32(%r11), %ymm15, %ymm1
+ vmaskmovpd 64(%r11), %ymm15, %ymm2
+ vmaskmovpd 96(%r11), %ymm15, %ymm3
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ subl $4, %r10d
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ vmovapd 32(%r13), %ymm4
+ vbroadcastsd 32(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 40(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 48(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 56(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 32(%r13)
+
+ vmovapd 64(%r13), %ymm4
+ vbroadcastsd 64(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 72(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 80(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 88(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 64(%r13)
+
+ vmovapd 96(%r13), %ymm4
+ vbroadcastsd 96(%r12), %ymm15
+ addq $128, %r12
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd -24(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd -16(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd -8(%r12), %ymm15
+ addq $128, %r13
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, -32(%r13)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r13), %ymm4
+ vbroadcastsd 0(%r12), %ymm15
+ vfnmadd231pd %ymm0, %ymm15, %ymm4
+ vbroadcastsd 8(%r12), %ymm15
+ vfnmadd231pd %ymm1, %ymm15, %ymm4
+ vbroadcastsd 16(%r12), %ymm15
+ vfnmadd231pd %ymm2, %ymm15, %ymm4
+ vbroadcastsd 24(%r12), %ymm15
+ vfnmadd231pd %ymm3, %ymm15, %ymm4
+ vmovapd %ymm4, 0(%r13)
+
+ addq $32, %r12
+ addq $32, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+	jg		2b // cleanup loop
+
+ // return
+0:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dger4_sub_4r_vs_lib4, .-kernel_dger4_sub_4r_vs_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00:
+#elif defined(OS_MAC)
+LC00:
+ .align 5
+#endif
+ .double 0.5
+ .double 1.5
+ .double 2.5
+ .double 3.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01:
+#elif defined(OS_MAC)
+LC01:
+ .align 5
+#endif
+ .double 4.5
+ .double 5.5
+ .double 6.5
+ .double 7.5
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02:
+#elif defined(OS_MAC)
+LC02:
+ .align 5
+#endif
+ .double 8.5
+ .double 9.5
+ .double 10.5
+ .double 11.5
+
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgelqf_4_lib4.S b/kernel/avx2/kernel_dgelqf_4_lib4.S
new file mode 100644
index 0000000..2f8b1be
--- /dev/null
+++ b/kernel/avx2/kernel_dgelqf_4_lib4.S
@@ -0,0 +1,5728 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dgelqf_dlarft12_12_lib4(int n, double *pD, int sdd, double *dD, double *pT)
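+//
+// This kernel appears to fuse the LQ factorization of a 12 x n block, stored
+// in pD as three 4-row lib4 panels with panel stride sdd, with the build-up
+// of the 12 x 12 triangular factor T of the block Householder reflectors
+// (in the spirit of LAPACK dgelqf followed by dlarft); dD receives the
+// per-reflector scalar coefficients (description inferred from the code below).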
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgelqf_dlarft12_12_lib4
+ .type kernel_dgelqf_dlarft12_12_lib4, @function
+kernel_dgelqf_dlarft12_12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgelqf_dlarft12_12_lib4
+_kernel_dgelqf_dlarft12_12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgelqf_dlarft12_12_lib4
+ .def kernel_dgelqf_dlarft12_12_lib4; .scl 2; .type 32; .endef
+kernel_dgelqf_dlarft12_12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero T
+
+ movq ARG5, %r10 // T
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm15, 0(%r10)
+ vmovapd %ymm15, 32(%r10)
+ vmovapd %ymm15, 64(%r10)
+ vmovapd %ymm15, 96(%r10)
+
+ // first column
+
+ movq ARG2, %r11 // D
+ movq ARG3, %r14 // sdd
+ sall $5, %r14d
+ movq ARG4, %r12 // dD
+ movq ARG5, %r13 // T
+	movq	$384, %r15 // sdt (hard-coded)
+
+ vxorpd %xmm15, %xmm15, %xmm15
+ movq ARG1, %r10 // n
+ subl $1, %r10d
+ addq $32, %r11
+100:
+ vmovsd 0(%r11), %xmm14
+ vfmadd231sd %xmm14, %xmm14, %xmm15
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 100b
+
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 101f
+ vmovsd %xmm14, 0(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 0(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 0(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 0(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 0(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 0(%r11), %ymm0
+ vmovapd 0(%r11, %r14, 1), %ymm1
+ vmovapd 0(%r11, %r14, 2), %ymm2
+ vbroadcastsd 32(%r11), %ymm8
+ vbroadcastsd 64(%r11), %ymm9
+ vbroadcastsd 96(%r11), %ymm10
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 32(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm8, 32(%r11)
+ vmovsd %xmm9, 64(%r11)
+ vmovsd %xmm10, 96(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 0(%r11), %ymm8
+ vbroadcastsd 32(%r11), %ymm9
+ vbroadcastsd 64(%r11), %ymm10
+ vbroadcastsd 96(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 0(%r11)
+ vmovsd %xmm9, 32(%r11)
+ vmovsd %xmm10, 64(%r11)
+ vmovsd %xmm11, 96(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 0(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 0(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm15, %ymm2, %ymm2
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0x55, %ymm15, %ymm15 // beta
+
+ // second column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 8(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 40(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 40(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 8(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 40(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 32(%r11), %ymm0
+ vmovapd 32(%r11, %r14, 1), %ymm1
+ vmovapd 32(%r11, %r14, 2), %ymm2
+ vbroadcastsd 72(%r11), %ymm9
+ vbroadcastsd 104(%r11), %ymm10
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm9, 72(%r11)
+ vmovsd %xmm10, 104(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 8(%r11), %ymm8
+ vbroadcastsd 40(%r11), %ymm9
+ vbroadcastsd 72(%r11), %ymm10
+ vbroadcastsd 104(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 8(%r11)
+ vmovsd %xmm9, 40(%r11)
+ vmovsd %xmm10, 72(%r11)
+ vmovsd %xmm11, 104(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 8(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 8(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC02(%rip), %ymm12
+#else
+ vmovapd LC02(%rip), %ymm12
+#endif
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm0
+ vbroadcastsd 40(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm15, %ymm2, %ymm2
+ vmovsd %xmm0, 32(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x3, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 40(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xaa, %ymm15, %ymm15 // beta
+
+ // third column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 16(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 80(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 80(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 16(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 80(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 64(%r11), %ymm0
+ vmovapd 64(%r11, %r14, 1), %ymm1
+ vmovapd 64(%r11, %r14, 2), %ymm2
+ vbroadcastsd 112(%r11), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm10, 112(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 16(%r11), %ymm8
+ vbroadcastsd 48(%r11), %ymm9
+ vbroadcastsd 80(%r11), %ymm10
+ vbroadcastsd 112(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 16(%r11)
+ vmovsd %xmm9, 48(%r11)
+ vmovsd %xmm10, 80(%r11)
+ vmovsd %xmm11, 112(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 16(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 16(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
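+ // fold the third reflector into T: accumulate against the first two columns of pT
+ // (blended down to their triangular part), scale the accumulated vectors by the
+ // diagonal entry just stored in pT, and write the two off-diagonal entries of
+ // column 2 of T (the diagonal one is already in place)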
+ vxorpd %xmm12, %xmm12, %xmm12
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vblendpd $0x7, %ymm15, %ymm0, %ymm0
+ vbroadcastsd 80(%r13), %ymm14
+ vmulpd %ymm14, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd %xmm0, 64(%r13)
+
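+ // clear the lanes of ymm0 at and above the pivot row so that only the rows below it are updated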
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x7, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 48(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 80(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xff, %ymm15, %ymm15 // beta
+
+ // fourth column
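+ // the first 4-row block of D has no rows left below this pivot, so the trailing
+ // update (loops 106/108 below) only touches the (%r14,1) and (%r14,2) row blocks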
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 24(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 120(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 120(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 24(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 120(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 96(%r11), %ymm0
+ vmovapd 96(%r11, %r14, 1), %ymm1
+ vmovapd 96(%r11, %r14, 2), %ymm2
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 24(%r11), %ymm8
+ vbroadcastsd 56(%r11), %ymm9
+ vbroadcastsd 88(%r11), %ymm10
+ vbroadcastsd 120(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 24(%r11)
+ vmovsd %xmm9, 56(%r11)
+ vmovsd %xmm10, 88(%r11)
+ vmovsd %xmm11, 120(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 24(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 24(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vbroadcastsd 120(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 96(%r13), %ymm0
+ vblendpd $0x7, %ymm15, %ymm0, %ymm0
+ vmovapd %ymm0, 96(%r13)
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 56(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 88(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 120(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 56(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 88(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 120(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+// vpermpd $0x00, %ymm15, %ymm15 // beta
+
+ // fifth column
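+ // from the fifth reflector on the pivot sits in the second 4-row block of D
+ // (addressed via %r14) and the new pT entries go to the second row block of pT
+ // (addressed via %r15); %r11 is first advanced by 128 bytes to column 4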
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 32(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $128, %r11
+ vmovsd 0(%r11, %r14, 1), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 0(%r11, %r14, 1) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 32(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 128(%r13, %r15, 1) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 0(%r11), %ymm0
+ vmovapd 0(%r11, %r14, 1), %ymm1
+ vmovapd 0(%r11, %r14, 2), %ymm2
+ vbroadcastsd 32(%r11, %r14, 1), %ymm8
+ vbroadcastsd 64(%r11, %r14, 1), %ymm9
+ vbroadcastsd 96(%r11, %r14, 1), %ymm10
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 32(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm8, 32(%r11, %r14, 1)
+ vmovsd %xmm9, 64(%r11, %r14, 1)
+ vmovsd %xmm10, 96(%r11, %r14, 1)
+ movq ARG1, %r10 // n
+ subl $8, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 0(%r11, %r14, 1), %ymm8
+ vbroadcastsd 32(%r11, %r14, 1), %ymm9
+ vbroadcastsd 64(%r11, %r14, 1), %ymm10
+ vbroadcastsd 96(%r11, %r14, 1), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 0(%r11, %r14, 1)
+ vmovsd %xmm9, 32(%r11, %r14, 1)
+ vmovsd %xmm10, 64(%r11, %r14, 1)
+ vmovsd %xmm11, 96(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 0(%r11, %r14, 1), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 0(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vmovapd 96(%r13), %ymm14
+ vpermpd $0xff, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vbroadcastsd 128(%r13, %r15, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+// vmovapd 128(%r13), %ymm0
+// vblendpd $0xf, %ymm15, %ymm0, %ymm15
+ vmovapd %ymm15, 128(%r13)
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm1, %ymm1
+
+ movq ARG2, %r11 // D
+ //
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0x55, %ymm15, %ymm15 // beta
+
+ // sixth column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 40(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $128, %r11
+ vmovsd 40(%r11, %r14, 1), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 40(%r11, %r14, 1) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 40(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 168(%r13, %r15, 1) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 32(%r11), %ymm0
+ vmovapd 32(%r11, %r14, 1), %ymm1
+ vmovapd 32(%r11, %r14, 2), %ymm2
+ vbroadcastsd 72(%r11, %r14, 1), %ymm9
+ vbroadcastsd 104(%r11, %r14, 1), %ymm10
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm9, 72(%r11, %r14, 1)
+ vmovsd %xmm10, 104(%r11, %r14, 1)
+ movq ARG1, %r10 // n
+ subl $8, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 8(%r11, %r14, 1), %ymm8
+ vbroadcastsd 40(%r11, %r14, 1), %ymm9
+ vbroadcastsd 72(%r11, %r14, 1), %ymm10
+ vbroadcastsd 104(%r11, %r14, 1), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 8(%r11, %r14, 1)
+ vmovsd %xmm9, 40(%r11, %r14, 1)
+ vmovsd %xmm10, 72(%r11, %r14, 1)
+ vmovsd %xmm11, 104(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 8(%r11, %r14, 1), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 8(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
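+ // the T column of the sixth reflector now spans two row blocks of pT: the first
+ // block accumulates in ymm15, the second in ymm11; both are scaled by the new
+ // diagonal entry and stored at column offset 160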
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vmovapd 96(%r13), %ymm14
+ vpermpd $0xff, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vmovapd 128(%r13), %ymm14
+ vmovapd 128(%r13, %r15, 1), %ymm11
+ vblendpd $0x1, %ymm11, %ymm12, %ymm11
+ vpermpd $0x00, %ymm1, %ymm13 // vv
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmulpd %ymm11, %ymm13, %ymm11
+ //
+ vbroadcastsd 168(%r13, %r15, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm11, %ymm11
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 160(%r13, %r15, 1), %ymm0
+ vblendpd $0x1, %ymm11, %ymm0, %ymm11
+ vmovapd %ymm15, 160(%r13)
+ vmovapd %ymm11, 160(%r13, %r15, 1)
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm1, %ymm1
+
+ movq ARG2, %r11 // D
+ //
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 40(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xaa, %ymm15, %ymm15 // beta
+
+ // seventh column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 48(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $128, %r11
+ vmovsd 80(%r11, %r14, 1), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 80(%r11, %r14, 1) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 48(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 208(%r13, %r15, 1) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 64(%r11), %ymm0
+ vmovapd 64(%r11, %r14, 1), %ymm1
+ vmovapd 64(%r11, %r14, 2), %ymm2
+ vbroadcastsd 112(%r11, %r14, 1), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm10, 112(%r11, %r14, 1)
+ movq ARG1, %r10 // n
+ subl $8, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 16(%r11, %r14, 1), %ymm8
+ vbroadcastsd 48(%r11, %r14, 1), %ymm9
+ vbroadcastsd 80(%r11, %r14, 1), %ymm10
+ vbroadcastsd 112(%r11, %r14, 1), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 16(%r11, %r14, 1)
+ vmovsd %xmm9, 48(%r11, %r14, 1)
+ vmovsd %xmm10, 80(%r11, %r14, 1)
+ vmovsd %xmm11, 112(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 16(%r11, %r14, 1), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 16(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+// vpermpd $0x00, %ymm0, %ymm13
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vpermpd $0x55, %ymm0, %ymm13
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xaa, %ymm0, %ymm13
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xff, %ymm0, %ymm13
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0x00, %ymm1, %ymm13 // vv
+ vmovapd 128(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 128(%r13, %r15, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x55, %ymm1, %ymm13 // vv
+ vmovapd 160(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 160(%r13, %r15, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vbroadcastsd 208(%r13, %r15, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm11, %ymm11
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 192(%r13, %r15, 1), %ymm0
+ vblendpd $0x3, %ymm11, %ymm0, %ymm11
+ vmovapd %ymm15, 192(%r13)
+ vmovapd %ymm11, 192(%r13, %r15, 1)
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm1, %ymm1
+
+ movq ARG2, %r11 // D
+ //
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 48(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 80(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xff, %ymm15, %ymm15 // beta
+
+ // eighth column
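+ // from this reflector on, all rows still to be updated lie in the third 4-row
+ // block of D, so the trailing-update loops below only touch the (%r14,2) addresses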
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 56(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $128, %r11
+ vmovsd 120(%r11, %r14, 1), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 120(%r11, %r14, 1) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 56(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 248(%r13, %r15, 1) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 96(%r11), %ymm0
+ vmovapd 96(%r11, %r14, 1), %ymm1
+ vmovapd 96(%r11, %r14, 2), %ymm2
+ movq ARG1, %r10 // n
+ subl $8, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 24(%r11, %r14, 1), %ymm8
+ vbroadcastsd 56(%r11, %r14, 1), %ymm9
+ vbroadcastsd 88(%r11, %r14, 1), %ymm10
+ vbroadcastsd 120(%r11, %r14, 1), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 24(%r11, %r14, 1)
+ vmovsd %xmm9, 56(%r11, %r14, 1)
+ vmovsd %xmm10, 88(%r11, %r14, 1)
+ vmovsd %xmm11, 120(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 24(%r11, %r14, 1), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 24(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+// vpermpd $0x00, %ymm0, %ymm13
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vpermpd $0x55, %ymm0, %ymm13
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xaa, %ymm0, %ymm13
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xff, %ymm0, %ymm13
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0x00, %ymm1, %ymm13
+ vmovapd 128(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 128(%r13, %r15, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x55, %ymm1, %ymm13
+ vmovapd 160(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 160(%r13, %r15, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xaa, %ymm1, %ymm13
+ vmovapd 192(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 192(%r13, %r15, 1), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vbroadcastsd 248(%r13, %r15, 1), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm11, %ymm11
+// vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 224(%r13, %r15, 1), %ymm0
+ vblendpd $0x7, %ymm11, %ymm0, %ymm11
+ vmovapd %ymm15, 224(%r13)
+ vmovapd %ymm11, 224(%r13, %r15, 1)
+
+// vxorpd %ymm15, %ymm15, %ymm15
+// vblendpd $0xf, %ymm15, %ymm1, %ymm1
+
+ movq ARG2, %r11 // D
+ //
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 56(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 88(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 120(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 56(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 88(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 120(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11, %r14, 1), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+// vpermpd $0x00, %ymm15, %ymm15 // beta
+
+ // ninth column
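+ // reflectors 8-11: the pivot moves into the third 4-row block of D (%r14,2) and
+ // the new pT entries into the third row block of pT (%r15,2); %r11 is advanced by 256 bytes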
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 64(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $256, %r11
+ vmovsd 0(%r11, %r14, 2), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 0(%r11, %r14, 2) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 64(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 256(%r13, %r15, 2) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 0(%r11), %ymm0
+ vmovapd 0(%r11, %r14, 1), %ymm1
+ vmovapd 0(%r11, %r14, 2), %ymm2
+ vbroadcastsd 32(%r11, %r14, 2), %ymm8
+ vbroadcastsd 64(%r11, %r14, 2), %ymm9
+ vbroadcastsd 96(%r11, %r14, 2), %ymm10
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 32(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm8, 32(%r11, %r14, 2)
+ vmovsd %xmm9, 64(%r11, %r14, 2)
+ vmovsd %xmm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $12, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 0(%r11, %r14, 2), %ymm8
+ vbroadcastsd 32(%r11, %r14, 2), %ymm9
+ vbroadcastsd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11, %r14, 2), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 0(%r11, %r14, 2)
+ vmovsd %xmm9, 32(%r11, %r14, 2)
+ vmovsd %xmm10, 64(%r11, %r14, 2)
+ vmovsd %xmm11, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 0(%r11, %r14, 2), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+// vpermpd $0x00, %ymm0, %ymm13
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vpermpd $0x55, %ymm0, %ymm13
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xaa, %ymm0, %ymm13
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xff, %ymm0, %ymm13
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0x00, %ymm1, %ymm13
+ vmovapd 128(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 128(%r13, %r15, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x55, %ymm1, %ymm13
+ vmovapd 160(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 160(%r13, %r15, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xaa, %ymm1, %ymm13
+ vmovapd 192(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 192(%r13, %r15, 1), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xff, %ymm1, %ymm13
+ vmovapd 224(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 224(%r13, %r15, 1), %ymm14
+// vblendpd $0xf, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vbroadcastsd 256(%r13, %r15, 2), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm11, %ymm11
+// vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+// vmovapd 224(%r13, %r15, 1), %ymm0
+// vblendpd $0xf, %ymm11, %ymm0, %ymm11
+ vmovapd %ymm15, 256(%r13)
+ vmovapd %ymm11, 256(%r13, %r15, 1)
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm2, %ymm2
+
+ movq ARG2, %r11 // D
+ //
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0x55, %ymm15, %ymm15 // beta
+
+ // tenth column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 72(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $256, %r11
+ vmovsd 40(%r11, %r14, 2), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 40(%r11, %r14, 2) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 72(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 296(%r13, %r15, 2) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 32(%r11), %ymm0
+ vmovapd 32(%r11, %r14, 1), %ymm1
+ vmovapd 32(%r11, %r14, 2), %ymm2
+ vbroadcastsd 72(%r11, %r14, 2), %ymm9
+ vbroadcastsd 104(%r11, %r14, 2), %ymm10
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm9, 72(%r11, %r14, 2)
+ vmovsd %xmm10, 104(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $12, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 8(%r11, %r14, 2), %ymm8
+ vbroadcastsd 40(%r11, %r14, 2), %ymm9
+ vbroadcastsd 72(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11, %r14, 2), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 8(%r11, %r14, 2)
+ vmovsd %xmm9, 40(%r11, %r14, 2)
+ vmovsd %xmm10, 72(%r11, %r14, 2)
+ vmovsd %xmm11, 104(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 8(%r11, %r14, 2), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 8(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
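+ // the T column of the tenth reflector spans all three row blocks of pT (ymm15,
+ // ymm11, ymm10), each scaled by the new diagonal entry and stored at column offset 288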
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+// vpermpd $0x00, %ymm0, %ymm13
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vpermpd $0x55, %ymm0, %ymm13
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xaa, %ymm0, %ymm13
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xff, %ymm0, %ymm13
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0x00, %ymm1, %ymm13
+ vmovapd 128(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 128(%r13, %r15, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x55, %ymm1, %ymm13
+ vmovapd 160(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 160(%r13, %r15, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xaa, %ymm1, %ymm13
+ vmovapd 192(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 192(%r13, %r15, 1), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xff, %ymm1, %ymm13
+ vmovapd 224(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 224(%r13, %r15, 1), %ymm14
+// vblendpd $0xf, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x00, %ymm2, %ymm13
+ vmovapd 256(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 256(%r13, %r15, 1), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ vmovapd 256(%r13, %r15, 2), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm10
+ //
+ vbroadcastsd 296(%r13, %r15, 2), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm11, %ymm11
+ vmulpd %ymm14, %ymm10, %ymm10
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 288(%r13, %r15, 2), %ymm0
+ vblendpd $0x1, %ymm10, %ymm0, %ymm10
+ vmovapd %ymm15, 288(%r13)
+ vmovapd %ymm11, 288(%r13, %r15, 1)
+ vmovapd %ymm10, 288(%r13, %r15, 2)
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm2, %ymm2
+
+ movq ARG2, %r11 // D
+ //
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 40(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xaa, %ymm15, %ymm15 // beta
+
+ // eleventh column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 80(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $256, %r11
+ vmovsd 80(%r11, %r14, 2), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 80(%r11, %r14, 2) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 80(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 336(%r13, %r15, 2) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 64(%r11), %ymm0
+ vmovapd 64(%r11, %r14, 1), %ymm1
+ vmovapd 64(%r11, %r14, 2), %ymm2
+ vbroadcastsd 112(%r11, %r14, 2), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm10, 112(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $12, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 16(%r11, %r14, 2), %ymm8
+ vbroadcastsd 48(%r11, %r14, 2), %ymm9
+ vbroadcastsd 80(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11, %r14, 2), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 16(%r11, %r14, 2)
+ vmovsd %xmm9, 48(%r11, %r14, 2)
+ vmovsd %xmm10, 80(%r11, %r14, 2)
+ vmovsd %xmm11, 112(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 16(%r11, %r14, 2), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 16(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+// vpermpd $0x00, %ymm0, %ymm13
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vpermpd $0x55, %ymm0, %ymm13
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xaa, %ymm0, %ymm13
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xff, %ymm0, %ymm13
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0x00, %ymm1, %ymm13
+ vmovapd 128(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 128(%r13, %r15, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x55, %ymm1, %ymm13
+ vmovapd 160(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 160(%r13, %r15, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xaa, %ymm1, %ymm13
+ vmovapd 192(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 192(%r13, %r15, 1), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xff, %ymm1, %ymm13
+ vmovapd 224(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 224(%r13, %r15, 1), %ymm14
+// vblendpd $0xf, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x00, %ymm2, %ymm13
+ vmovapd 256(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 256(%r13, %r15, 1), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ vmovapd 256(%r13, %r15, 2), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm10
+ //
+ vpermpd $0x55, %ymm2, %ymm13
+ vmovapd 288(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 288(%r13, %r15, 1), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ vmovapd 288(%r13, %r15, 2), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm10
+ //
+ vbroadcastsd 336(%r13, %r15, 2), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm11, %ymm11
+ vmulpd %ymm14, %ymm10, %ymm10
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 320(%r13, %r15, 2), %ymm0
+ vblendpd $0x3, %ymm10, %ymm0, %ymm10
+ vmovapd %ymm15, 320(%r13)
+ vmovapd %ymm11, 320(%r13, %r15, 1)
+ vmovapd %ymm10, 320(%r13, %r15, 2)
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm2, %ymm2
+
+ movq ARG2, %r11 // D
+ //
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 48(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 80(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11, %r14, 2), %ymm14
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xff, %ymm15, %ymm15 // beta
+
+	// twelfth column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+	vmovsd			%xmm14, 88(%r12) // dD[11]
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ addq $256, %r11
+ vmovsd 120(%r11, %r14, 2), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 120(%r11, %r14, 2) // pD[3+ps*11] (3rd panel)
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 88(%r12) // dD[11]
+ vxorpd %xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 376(%r13, %r15, 2) // pT[3+ps*11] (3rd panel)
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 96(%r11), %ymm0
+ vmovapd 96(%r11, %r14, 1), %ymm1
+ vmovapd 96(%r11, %r14, 2), %ymm2
+ movq ARG1, %r10 // n
+ subl $12, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 24(%r11, %r14, 2), %ymm8
+ vbroadcastsd 56(%r11, %r14, 2), %ymm9
+ vbroadcastsd 88(%r11, %r14, 2), %ymm10
+ vbroadcastsd 120(%r11, %r14, 2), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 24(%r11, %r14, 2)
+ vmovsd %xmm9, 56(%r11, %r14, 2)
+ vmovsd %xmm10, 88(%r11, %r14, 2)
+ vmovsd %xmm11, 120(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 24(%r11, %r14, 2), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 24(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ //
+// vpermpd $0x00, %ymm0, %ymm13
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ //
+ vpermpd $0x55, %ymm0, %ymm13
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xaa, %ymm0, %ymm13
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0xff, %ymm0, %ymm13
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ //
+ vpermpd $0x00, %ymm1, %ymm13
+ vmovapd 128(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 128(%r13, %r15, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x55, %ymm1, %ymm13
+ vmovapd 160(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 160(%r13, %r15, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xaa, %ymm1, %ymm13
+ vmovapd 192(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 192(%r13, %r15, 1), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0xff, %ymm1, %ymm13
+ vmovapd 224(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 224(%r13, %r15, 1), %ymm14
+// vblendpd $0xf, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ //
+ vpermpd $0x00, %ymm2, %ymm13
+ vmovapd 256(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 256(%r13, %r15, 1), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ vmovapd 256(%r13, %r15, 2), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm13, %ymm10
+ //
+ vpermpd $0x55, %ymm2, %ymm13
+ vmovapd 288(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 288(%r13, %r15, 1), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ vmovapd 288(%r13, %r15, 2), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm10
+ //
+ vpermpd $0xaa, %ymm2, %ymm13
+ vmovapd 320(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vmovapd 320(%r13, %r15, 1), %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm11
+ vmovapd 320(%r13, %r15, 2), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vfmadd231pd %ymm14, %ymm13, %ymm10
+ //
+ vbroadcastsd 376(%r13, %r15, 2), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm11, %ymm11
+ vmulpd %ymm14, %ymm10, %ymm10
+// vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 352(%r13, %r15, 2), %ymm0
+ vblendpd $0x7, %ymm10, %ymm0, %ymm10
+ vmovapd %ymm15, 352(%r13)
+ vmovapd %ymm11, 352(%r13, %r15, 1)
+ vmovapd %ymm10, 352(%r13, %r15, 2)
+
+102:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgelqf_dlarft12_12_lib4, .-kernel_dgelqf_dlarft12_12_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dgelqf_dlarft4_12_lib4(int n, double *pD, int sdd, double *dD, double *pT)
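+//
+// What this kernel appears to do (inferred from the LAPACK-style name and the
+// register usage below; the prototype above is the interface): factorize the
+// leading 4 x n row block of an LQ factorization in place, apply the 4
+// resulting Householder reflectors across all three 4-row panels of pD
+// (12 rows, the extra panels addressed through sdd), and build the 4x4
+// triangular factor T of the block reflector in pT, with the scalar factors
+// in dD. pD is assumed to be in the lib4 panel-major layout (ps = 4 doubles,
+// panel byte stride sdd*ps*8, cf. the sall $5, %r14d below).
+//
+// Assumed call from C, for orientation only:
+//   kernel_dgelqf_dlarft4_12_lib4(n, pD, sdd, dD, pT);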
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgelqf_dlarft4_12_lib4
+ .type kernel_dgelqf_dlarft4_12_lib4, @function
+kernel_dgelqf_dlarft4_12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgelqf_dlarft4_12_lib4
+_kernel_dgelqf_dlarft4_12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgelqf_dlarft4_12_lib4
+ .def kernel_dgelqf_dlarft4_12_lib4; .scl 2; .type 32; .endef
+kernel_dgelqf_dlarft4_12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero T
+
+ movq ARG5, %r10 // T
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm15, 0(%r10)
+ vmovapd %ymm15, 32(%r10)
+ vmovapd %ymm15, 64(%r10)
+ vmovapd %ymm15, 96(%r10)
+
+ // first column
+
+ movq ARG2, %r11 // D
+ movq ARG3, %r14 // sdd
+ sall $5, %r14d
+ movq ARG4, %r12 // dD
+ movq ARG5, %r13 // T
+
+ vxorpd %xmm15, %xmm15, %xmm15
+ movq ARG1, %r10 // n
+ subl $1, %r10d
+ addq $32, %r11
+100:
+ vmovsd 0(%r11), %xmm14
+ vfmadd231sd %xmm14, %xmm14, %xmm15
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 100b
+
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 101f
+ vmovsd %xmm14, 0(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 0(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 0(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 0(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 0(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 0(%r11), %ymm0
+ vmovapd 0(%r11, %r14, 1), %ymm1
+ vmovapd 0(%r11, %r14, 2), %ymm2
+ vbroadcastsd 32(%r11), %ymm8
+ vbroadcastsd 64(%r11), %ymm9
+ vbroadcastsd 96(%r11), %ymm10
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 32(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm8, 32(%r11)
+ vmovsd %xmm9, 64(%r11)
+ vmovsd %xmm10, 96(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 0(%r11), %ymm8
+ vbroadcastsd 32(%r11), %ymm9
+ vbroadcastsd 64(%r11), %ymm10
+ vbroadcastsd 96(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 0(%r11)
+ vmovsd %xmm9, 32(%r11)
+ vmovsd %xmm10, 64(%r11)
+ vmovsd %xmm11, 96(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 0(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 0(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm15, %ymm2, %ymm2
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0x55, %ymm15, %ymm15 // beta
+
+ // second column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 8(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 40(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 40(%r11) // pD[1+ps*1]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 8(%r12) // dD[1]
+ vxorpd %xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 40(%r13) // pT[1+ps*1]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 32(%r11), %ymm0
+ vmovapd 32(%r11, %r14, 1), %ymm1
+ vmovapd 32(%r11, %r14, 2), %ymm2
+ vbroadcastsd 72(%r11), %ymm9
+ vbroadcastsd 104(%r11), %ymm10
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm9, 72(%r11)
+ vmovsd %xmm10, 104(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 8(%r11), %ymm8
+ vbroadcastsd 40(%r11), %ymm9
+ vbroadcastsd 72(%r11), %ymm10
+ vbroadcastsd 104(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 8(%r11)
+ vmovsd %xmm9, 40(%r11)
+ vmovsd %xmm10, 72(%r11)
+ vmovsd %xmm11, 104(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 8(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 8(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC02(%rip), %ymm12
+#else
+ vmovapd LC02(%rip), %ymm12
+#endif
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm0
+ vbroadcastsd 40(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm15, %ymm1, %ymm1
+ vmulpd %ymm15, %ymm2, %ymm2
+ vmovsd %xmm0, 32(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x3, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmulpd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 40(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xaa, %ymm15, %ymm15 // beta
+
+ // third column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 16(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 80(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 80(%r11) // pD[2+ps*2]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 16(%r12) // dD[2]
+ vxorpd %xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 80(%r13) // pT[2+ps*2]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 64(%r11), %ymm0
+ vmovapd 64(%r11, %r14, 1), %ymm1
+ vmovapd 64(%r11, %r14, 2), %ymm2
+ vbroadcastsd 112(%r11), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm10, %ymm2
+ vmovsd %xmm10, 112(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 16(%r11), %ymm8
+ vbroadcastsd 48(%r11), %ymm9
+ vbroadcastsd 80(%r11), %ymm10
+ vbroadcastsd 112(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 16(%r11)
+ vmovsd %xmm9, 48(%r11)
+ vmovsd %xmm10, 80(%r11)
+ vmovsd %xmm11, 112(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 16(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 16(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vblendpd $0x7, %ymm15, %ymm0, %ymm0
+ vbroadcastsd 80(%r13), %ymm14
+ vmulpd %ymm14, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd %xmm0, 64(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x7, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 48(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 80(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xff, %ymm15, %ymm15 // beta
+
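+	// fourth column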
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 24(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 120(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 120(%r11) // pD[3+ps*3]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 24(%r12) // dD[3]
+ vxorpd %xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 120(%r13) // pT[3+ps*3]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 96(%r11), %ymm0
+ vmovapd 96(%r11, %r14, 1), %ymm1
+ vmovapd 96(%r11, %r14, 2), %ymm2
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 24(%r11), %ymm8
+ vbroadcastsd 56(%r11), %ymm9
+ vbroadcastsd 88(%r11), %ymm10
+ vbroadcastsd 120(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 32(%r11, %r14, 2), %ymm9, %ymm2
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 64(%r11, %r14, 2), %ymm10, %ymm2
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vfmadd231pd 96(%r11, %r14, 2), %ymm11, %ymm2
+ vmovsd %xmm8, 24(%r11)
+ vmovsd %xmm9, 56(%r11)
+ vmovsd %xmm10, 88(%r11)
+ vmovsd %xmm11, 120(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 24(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 0(%r11, %r14, 2), %ymm8, %ymm2
+ vmovsd %xmm8, 24(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+
+ vbroadcastsd 120(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmulpd %ymm14, %ymm2, %ymm2
+ vmovapd 96(%r13), %ymm0
+ vblendpd $0x7, %ymm15, %ymm0, %ymm0
+ vmovapd %ymm0, 96(%r13)
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vaddpd %ymm1, %ymm9, %ymm9
+ vaddpd %ymm2, %ymm10, %ymm10
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vmovapd 32(%r11, %r14, 2), %ymm10
+ vbroadcastsd 56(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ vmovapd %ymm10, 32(%r11, %r14, 2)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vmovapd 64(%r11, %r14, 2), %ymm10
+ vbroadcastsd 88(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ vmovapd %ymm10, 64(%r11, %r14, 2)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vmovapd 96(%r11, %r14, 2), %ymm10
+ vbroadcastsd 120(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ vmovapd %ymm10, 96(%r11, %r14, 2)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vmovapd 0(%r11, %r14, 2), %ymm10
+ vbroadcastsd 24(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm2, %ymm14, %ymm10
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ vmovapd %ymm10, 0(%r11, %r14, 2)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+
+102:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgelqf_dlarft4_12_lib4, .-kernel_dgelqf_dlarft4_12_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dgelqf_dlarft4_8_lib4(int n, double *pD, int sdd, double *dD, double *pT)
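+//
+// What this kernel appears to do (same scheme as the _12 variant above, but
+// on two 4-row panels): factorize the leading 4 x n row block of an LQ
+// factorization in place, apply the 4 Householder reflectors across the 8
+// rows of the two panels of pD (second panel addressed through sdd), and
+// build the 4x4 triangular factor T in pT, with the scalar factors in dD.
+// The lib4 panel-major layout (ps = 4, panel byte stride sdd*ps*8) is assumed.
+//
+// Assumed call from C, for orientation only:
+//   kernel_dgelqf_dlarft4_8_lib4(n, pD, sdd, dD, pT);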
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgelqf_dlarft4_8_lib4
+ .type kernel_dgelqf_dlarft4_8_lib4, @function
+kernel_dgelqf_dlarft4_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgelqf_dlarft4_8_lib4
+_kernel_dgelqf_dlarft4_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgelqf_dlarft4_8_lib4
+ .def kernel_dgelqf_dlarft4_8_lib4; .scl 2; .type 32; .endef
+kernel_dgelqf_dlarft4_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero T
+
+ movq ARG5, %r10 // T
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm15, 0(%r10)
+ vmovapd %ymm15, 32(%r10)
+ vmovapd %ymm15, 64(%r10)
+ vmovapd %ymm15, 96(%r10)
+
+ // first column
+
+ movq ARG2, %r11 // D
+ movq ARG3, %r14 // sdd
+ sall $5, %r14d
+ movq ARG4, %r12 // dD
+ movq ARG5, %r13 // T
+
+ vxorpd %xmm15, %xmm15, %xmm15
+ movq ARG1, %r10 // n
+ subl $1, %r10d
+ addq $32, %r11
+100:
+ vmovsd 0(%r11), %xmm14
+ vfmadd231sd %xmm14, %xmm14, %xmm15
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 100b
+
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 101f
+ vmovsd %xmm14, 0(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 0(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 0(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 0(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 0(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 0(%r11), %ymm0
+ vmovapd 0(%r11, %r14, 1), %ymm1
+ vbroadcastsd 32(%r11), %ymm8
+ vbroadcastsd 64(%r11), %ymm9
+ vbroadcastsd 96(%r11), %ymm10
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 32(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vmovsd %xmm8, 32(%r11)
+ vmovsd %xmm9, 64(%r11)
+ vmovsd %xmm10, 96(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 0(%r11), %ymm8
+ vbroadcastsd 32(%r11), %ymm9
+ vbroadcastsd 64(%r11), %ymm10
+ vbroadcastsd 96(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vmovsd %xmm8, 0(%r11)
+ vmovsd %xmm9, 32(%r11)
+ vmovsd %xmm10, 64(%r11)
+ vmovsd %xmm11, 96(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 0(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vmovsd %xmm8, 0(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm15, %ymm1, %ymm1
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmulpd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0x55, %ymm15, %ymm15 // beta
+
+ // second column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 8(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 40(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 40(%r11) // pD[1+ps*1]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 8(%r12) // dD[1]
+ vxorpd %xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 40(%r13) // pT[1+ps*1]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 32(%r11), %ymm0
+ vmovapd 32(%r11, %r14, 1), %ymm1
+ vbroadcastsd 72(%r11), %ymm9
+ vbroadcastsd 104(%r11), %ymm10
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vmovsd %xmm9, 72(%r11)
+ vmovsd %xmm10, 104(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 8(%r11), %ymm8
+ vbroadcastsd 40(%r11), %ymm9
+ vbroadcastsd 72(%r11), %ymm10
+ vbroadcastsd 104(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vmovsd %xmm8, 8(%r11)
+ vmovsd %xmm9, 40(%r11)
+ vmovsd %xmm10, 72(%r11)
+ vmovsd %xmm11, 104(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 8(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vmovsd %xmm8, 8(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC02(%rip), %ymm12
+#else
+ vmovapd LC02(%rip), %ymm12
+#endif
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm0
+ vbroadcastsd 40(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmulpd %ymm15, %ymm1, %ymm1
+ vmovsd %xmm0, 32(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x3, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmulpd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vbroadcastsd 40(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xaa, %ymm15, %ymm15 // beta
+
+ // third column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 16(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 80(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 80(%r11) // pD[2+ps*2]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 16(%r12) // dD[2]
+ vxorpd %xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 80(%r13) // pT[2+ps*2]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 64(%r11), %ymm0
+ vmovapd 64(%r11, %r14, 1), %ymm1
+ vbroadcastsd 112(%r11), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm10, %ymm1
+ vmovsd %xmm10, 112(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 16(%r11), %ymm8
+ vbroadcastsd 48(%r11), %ymm9
+ vbroadcastsd 80(%r11), %ymm10
+ vbroadcastsd 112(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vmovsd %xmm8, 16(%r11)
+ vmovsd %xmm9, 48(%r11)
+ vmovsd %xmm10, 80(%r11)
+ vmovsd %xmm11, 112(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 16(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vmovsd %xmm8, 16(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+ vblendpd $0x7, %ymm15, %ymm0, %ymm0
+ vbroadcastsd 80(%r13), %ymm14
+ vmulpd %ymm14, %ymm0, %ymm0
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmovapd %xmm0, 64(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x7, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vaddpd %ymm0, %ymm8, %ymm8
+ vaddpd %ymm1, %ymm9, %ymm9
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ //
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vbroadcastsd 48(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 32(%r11)
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ //
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vbroadcastsd 80(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 64(%r11)
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ //
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 96(%r11)
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xff, %ymm15, %ymm15 // beta
+
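+	// fourth column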
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 24(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 120(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+	vmovsd			%xmm15, 120(%r11) // pD[3+ps*3]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd			%xmm14, 24(%r12) // dD[3]
+ vxorpd %xmm13, %xmm14, %xmm12
+	vmovsd			%xmm12, 120(%r13) // pT[3+ps*3]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 96(%r11), %ymm0
+ vmovapd 96(%r11, %r14, 1), %ymm1
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 24(%r11), %ymm8
+ vbroadcastsd 56(%r11), %ymm9
+ vbroadcastsd 88(%r11), %ymm10
+ vbroadcastsd 120(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 32(%r11, %r14, 1), %ymm9, %ymm1
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 64(%r11, %r14, 1), %ymm10, %ymm1
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vfmadd231pd 96(%r11, %r14, 1), %ymm11, %ymm1
+ vmovsd %xmm8, 24(%r11)
+ vmovsd %xmm9, 56(%r11)
+ vmovsd %xmm10, 88(%r11)
+ vmovsd %xmm11, 120(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 24(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 0(%r11, %r14, 1), %ymm8, %ymm1
+ vmovsd %xmm8, 24(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm15
+
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm15
+
+ vbroadcastsd 120(%r13), %ymm14
+ vmulpd %ymm14, %ymm15, %ymm15
+ vmulpd %ymm14, %ymm1, %ymm1
+ vmovapd 96(%r13), %ymm0
+ vblendpd $0x7, %ymm15, %ymm0, %ymm0
+ vmovapd %ymm0, 96(%r13)
+
+ movq ARG2, %r11 // D
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vaddpd %ymm1, %ymm9, %ymm9
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ //
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 24(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ //
+ vmovapd 32(%r11, %r14, 1), %ymm9
+ vbroadcastsd 56(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmovapd %ymm9, 32(%r11, %r14, 1)
+ //
+ vmovapd 64(%r11, %r14, 1), %ymm9
+ vbroadcastsd 88(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmovapd %ymm9, 64(%r11, %r14, 1)
+ //
+ vmovapd 96(%r11, %r14, 1), %ymm9
+ vbroadcastsd 120(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmovapd %ymm9, 96(%r11, %r14, 1)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11, %r14, 1), %ymm9
+ vbroadcastsd 24(%r11), %ymm14
+ vfmadd231pd %ymm1, %ymm14, %ymm9
+ vmovapd %ymm9, 0(%r11, %r14, 1)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+
+102:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgelqf_dlarft4_8_lib4, .-kernel_dgelqf_dlarft4_8_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT, double *beta)
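+//
+// What this kernel appears to do (single-panel case of the kernels above):
+// factorize a 4 x n row panel of an LQ factorization in place in pD (lib4
+// layout, ps = 4) and build the 4x4 triangular factor T in pT, with the
+// scalar reflector factors in dD. Note that only ARG1..ARG4 (n, pD, dD, pT)
+// appear to be read in the code below; the fifth argument listed in the
+// prototype above is not referenced here.
+//
+// Assumed call from C, for orientation only (matching the prototype above):
+//   kernel_dgelqf_dlarft4_4_lib4(n, pD, dD, pT, beta);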
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgelqf_dlarft4_4_lib4
+ .type kernel_dgelqf_dlarft4_4_lib4, @function
+kernel_dgelqf_dlarft4_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgelqf_dlarft4_4_lib4
+_kernel_dgelqf_dlarft4_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgelqf_dlarft4_4_lib4
+ .def kernel_dgelqf_dlarft4_4_lib4; .scl 2; .type 32; .endef
+kernel_dgelqf_dlarft4_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero T
+
+ movq ARG4, %r10 // T
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm15, 0(%r10)
+ vmovapd %ymm15, 32(%r10)
+ vmovapd %ymm15, 64(%r10)
+ vmovapd %ymm15, 96(%r10)
+
+ // first column
+
+ movq ARG2, %r11 // D
+ movq ARG3, %r12 // dD
+ movq ARG4, %r13 // T
+
+ vxorpd %xmm15, %xmm15, %xmm15
+ movq ARG1, %r10 // n
+ subl $1, %r10d
+ addq $32, %r11
+100:
+ vmovsd 0(%r11), %xmm14
+ vfmadd231sd %xmm14, %xmm14, %xmm15
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 100b
+
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 101f
+ vmovsd %xmm14, 0(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 0(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+ vmovsd %xmm15, 0(%r11) // pD[0+ps*0]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+ vmovsd %xmm14, 0(%r12) // dD[0]
+ vxorpd %xmm13, %xmm14, %xmm12
+ vmovsd %xmm12, 0(%r13) // pT[0+ps*0]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 0(%r11), %ymm0
+ vbroadcastsd 32(%r11), %ymm8
+ vbroadcastsd 64(%r11), %ymm9
+ vbroadcastsd 96(%r11), %ymm10
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 32(%r11), %ymm8, %ymm0
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vmovsd %xmm8, 32(%r11)
+ vmovsd %xmm9, 64(%r11)
+ vmovsd %xmm10, 96(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 0(%r11), %ymm8
+ vbroadcastsd 32(%r11), %ymm9
+ vbroadcastsd 64(%r11), %ymm10
+ vbroadcastsd 96(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vmovsd %xmm8, 0(%r11)
+ vmovsd %xmm9, 32(%r11)
+ vmovsd %xmm10, 64(%r11)
+ vmovsd %xmm11, 96(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 0(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vmovsd %xmm8, 0(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vbroadcastsd 0(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ vmovapd 0(%r11), %ymm8
+ vmovapd 32(%r11), %ymm9
+ vmovapd 64(%r11), %ymm10
+ vmovapd 96(%r11), %ymm11
+ vaddpd %ymm0, %ymm8, %ymm8
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm9
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm10
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm11
+ vmulpd %ymm10, %ymm10, %ymm15
+ vfmadd231pd %ymm11, %ymm11, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 32(%r11)
+ vmovapd %ymm10, 64(%r11)
+ vmovapd %ymm11, 96(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 32(%r11), %ymm9
+ vmovapd 64(%r11), %ymm10
+ vmovapd 96(%r11), %ymm11
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vbroadcastsd 32(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm9
+ vbroadcastsd 64(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm10
+ vbroadcastsd 96(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm11
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vfmadd231pd %ymm11, %ymm11, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 32(%r11)
+ vmovapd %ymm10, 64(%r11)
+ vmovapd %ymm11, 96(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0x55, %ymm15, %ymm15 // beta
+
+ // second column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 8(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 40(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+	vmovsd	%xmm15, 40(%r11) // pD[1+ps*1]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd	%xmm14, 8(%r12) // dD[1]
+ vxorpd %xmm13, %xmm14, %xmm12
+	vmovsd	%xmm12, 40(%r13) // pT[1+ps*1]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 32(%r11), %ymm0
+ vbroadcastsd 72(%r11), %ymm9
+ vbroadcastsd 104(%r11), %ymm10
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 64(%r11), %ymm9, %ymm0
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vmovsd %xmm9, 72(%r11)
+ vmovsd %xmm10, 104(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 8(%r11), %ymm8
+ vbroadcastsd 40(%r11), %ymm9
+ vbroadcastsd 72(%r11), %ymm10
+ vbroadcastsd 104(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vmovsd %xmm8, 8(%r11)
+ vmovsd %xmm9, 40(%r11)
+ vmovsd %xmm10, 72(%r11)
+ vmovsd %xmm11, 104(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 8(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vmovsd %xmm8, 8(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC02(%rip), %ymm12
+#else
+ vmovapd LC02(%rip), %ymm12
+#endif
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm0
+ vbroadcastsd 40(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmovsd %xmm0, 32(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x3, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ vmovapd 32(%r11), %ymm9
+ vmovapd 64(%r11), %ymm10
+ vmovapd 96(%r11), %ymm11
+ vaddpd %ymm0, %ymm9, %ymm9
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm10
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm11
+ vmulpd %ymm11, %ymm11, %ymm15
+ vmovapd %ymm9, 32(%r11)
+ vmovapd %ymm10, 64(%r11)
+ vmovapd %ymm11, 96(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 32(%r11), %ymm9
+ vmovapd 64(%r11), %ymm10
+ vmovapd 96(%r11), %ymm11
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vbroadcastsd 40(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm9
+ vbroadcastsd 72(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm10
+ vbroadcastsd 104(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm11
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vfmadd231pd %ymm11, %ymm11, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 32(%r11)
+ vmovapd %ymm10, 64(%r11)
+ vmovapd %ymm11, 96(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 8(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xaa, %ymm15, %ymm15 // beta
+
+ // third column
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 16(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 80(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+	vmovsd	%xmm15, 80(%r11) // pD[2+ps*2]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd	%xmm14, 16(%r12) // dD[2]
+ vxorpd %xmm13, %xmm14, %xmm12
+	vmovsd	%xmm12, 80(%r13) // pT[2+ps*2]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 64(%r11), %ymm0
+ vbroadcastsd 112(%r11), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vfmadd231pd 96(%r11), %ymm10, %ymm0
+ vmovsd %xmm10, 112(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 16(%r11), %ymm8
+ vbroadcastsd 48(%r11), %ymm9
+ vbroadcastsd 80(%r11), %ymm10
+ vbroadcastsd 112(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vmovsd %xmm8, 16(%r11)
+ vmovsd %xmm9, 48(%r11)
+ vmovsd %xmm10, 80(%r11)
+ vmovsd %xmm11, 112(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 16(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vmovsd %xmm8, 16(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm1
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm1
+ vblendpd $0x7, %ymm1, %ymm0, %ymm0
+ vbroadcastsd 80(%r13), %ymm15
+ vmulpd %ymm15, %ymm0, %ymm0
+ vmovapd %xmm0, 64(%r13)
+
+ vxorpd %ymm12, %ymm12, %ymm12
+ vblendpd $0x7, %ymm12, %ymm0, %ymm0
+
+ movq ARG2, %r11 // D
+ vmovapd 64(%r11), %ymm10
+ vmovapd 96(%r11), %ymm11
+ vaddpd %ymm0, %ymm10, %ymm10
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm11
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd %ymm10, 64(%r11)
+ vmovapd %ymm11, 96(%r11)
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 110f
+106:
+ vmovapd 0(%r11), %ymm8
+ vmovapd 32(%r11), %ymm9
+ vmovapd 64(%r11), %ymm10
+ vmovapd 96(%r11), %ymm11
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vbroadcastsd 48(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm9
+ vbroadcastsd 80(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm10
+ vbroadcastsd 112(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm11
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vfmadd231pd %ymm9, %ymm9, %ymm15
+ vfmadd231pd %ymm10, %ymm10, %ymm15
+ vfmadd231pd %ymm11, %ymm11, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ vmovapd %ymm9, 32(%r11)
+ vmovapd %ymm10, 64(%r11)
+ vmovapd %ymm11, 96(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 106b
+110:
+ cmpl $0, %r10d
+ jle 107f
+108:
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 16(%r11), %ymm14
+ vfmadd231pd %ymm0, %ymm14, %ymm8
+ vfmadd231pd %ymm8, %ymm8, %ymm15
+ vmovapd %ymm8, 0(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 108b
+107:
+ vpermpd $0xff, %ymm15, %ymm15 // beta
+
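+	// fourth column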
+102:
+ vxorpd %xmm14, %xmm14, %xmm14
+ vucomisd %xmm14, %xmm15
+ jne 101f
+// jp 111f
+ vmovsd %xmm14, 24(%r12)
+ jmp 102f
+
+101:
+ movq ARG2, %r11 // D
+ vmovsd 120(%r11), %xmm14 // alpha
+ vfmadd231sd %xmm14, %xmm14, %xmm15 // beta
+ vsqrtsd %xmm15, %xmm15, %xmm15 // beta
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC00(%rip), %xmm13 // mask
+#else
+ vmovsd LC00(%rip), %xmm13 // mask
+#endif
+ vandpd %xmm13, %xmm14, %xmm12
+ vxorpd %xmm13, %xmm12, %xmm12
+ vxorpd %xmm12, %xmm15, %xmm15 // beta
+	vmovsd	%xmm15, 120(%r11) // pD[3+ps*3]
+ vsubsd %xmm14, %xmm15, %xmm14 // beta-alpha
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC01(%rip), %xmm12
+#else
+ vmovapd LC01(%rip), %xmm12
+#endif
+ vmovsd %xmm14, %xmm12, %xmm12
+ vmovddup %xmm14, %xmm14
+ vmovsd %xmm15, %xmm14, %xmm14
+ vdivpd %xmm14, %xmm12, %xmm14
+	vmovsd	%xmm14, 24(%r12) // dD[3]
+ vxorpd %xmm13, %xmm14, %xmm12
+	vmovsd	%xmm12, 120(%r13) // pT[3+ps*3]
+
+ vpermpd $0x55, %ymm14, %ymm15 // tmp
+
+ vmovapd 96(%r11), %ymm0
+ movq ARG1, %r10 // n
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jle 109f
+103:
+ vbroadcastsd 24(%r11), %ymm8
+ vbroadcastsd 56(%r11), %ymm9
+ vbroadcastsd 88(%r11), %ymm10
+ vbroadcastsd 120(%r11), %ymm11
+ vmulpd %ymm15, %ymm8, %ymm8
+ vmulpd %ymm15, %ymm9, %ymm9
+ vmulpd %ymm15, %ymm10, %ymm10
+ vmulpd %ymm15, %ymm11, %ymm11
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vfmadd231pd 32(%r11), %ymm9, %ymm0
+ vfmadd231pd 64(%r11), %ymm10, %ymm0
+ vfmadd231pd 96(%r11), %ymm11, %ymm0
+ vmovsd %xmm8, 24(%r11)
+ vmovsd %xmm9, 56(%r11)
+ vmovsd %xmm10, 88(%r11)
+ vmovsd %xmm11, 120(%r11)
+ subl $4, %r10d
+ addq $128, %r11
+ cmpl $3, %r10d
+ jg 103b
+109:
+ cmpl $0, %r10d
+ jle 104f
+105:
+ vbroadcastsd 24(%r11), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vfmadd231pd 0(%r11), %ymm8, %ymm0
+ vmovsd %xmm8, 24(%r11)
+ subl $1, %r10d
+ addq $32, %r11
+ cmpl $0, %r10d
+ jg 105b
+104:
+
+ vxorpd %xmm12, %xmm12, %xmm12
+
+ vmovapd 0(%r13), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm14
+ vmulpd %ymm14, %ymm0, %ymm1
+
+ vmovapd 32(%r13), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm1
+
+ vmovapd 64(%r13), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm14
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfmadd231pd %ymm14, %ymm13, %ymm1
+
+ vbroadcastsd 120(%r13), %ymm15
+ vmulpd %ymm15, %ymm1, %ymm1
+ vmovapd 96(%r13), %ymm0
+ vblendpd $0x7, %ymm1, %ymm0, %ymm0
+ vmovapd %ymm0, 96(%r13)
+
+102:
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgelqf_dlarft4_4_lib4, .-kernel_dgelqf_dlarft4_4_lib4
+#endif
+
+
+
+
+
+// 1 2
+// void kernel_dlarfb_12_lib4(double *dK, double *pT)
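+//
+// note (added description, not part of the original source): with sdt
+// hardcoded to 384 bytes below, this routine appears to overwrite the 4x12
+// panel dK in place with the product dK * pT, where pT is a 12x12 upper
+// triangular factor stored in three 4-row panels of stride sdt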
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb_12_lib4
+ .type kernel_dlarfb_12_lib4, @function
+kernel_dlarfb_12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb_12_lib4
+_kernel_dlarfb_12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb_12_lib4
+ .def kernel_dlarfb_12_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb_12_lib4:
+#endif
+
+ PROLOGUE
+
+ movq ARG1, %r10 // K
+ movq ARG2, %r11 // T
+	movq	$384, %r12 // sdt !!! hardcoded: 384 bytes = 4*12*sizeof(double), i.e. panel stride for sdt=12
+
+ //
+ vmovapd 352(%r10), %ymm12
+ vbroadcastsd 376(%r11, %r12, 2), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm11
+ //
+ vmovapd 320(%r10), %ymm12
+ vbroadcastsd 368(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 336(%r11, %r12, 2), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm10
+ //
+ vmovapd 288(%r10), %ymm12
+ vbroadcastsd 360(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 328(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 296(%r11, %r12, 2), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm9
+ //
+ vmovapd 256(%r10), %ymm12
+ vbroadcastsd 352(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 320(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 288(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 256(%r11, %r12, 2), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm8
+ //
+ vmovapd 224(%r10), %ymm12
+ vbroadcastsd 376(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 344(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 312(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 280(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 248(%r11, %r12, 1), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm7
+ //
+ vmovapd 192(%r10), %ymm12
+ vbroadcastsd 368(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 336(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 304(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 272(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 240(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 208(%r11, %r12, 1), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm6
+ //
+ vmovapd 160(%r10), %ymm12
+ vbroadcastsd 360(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 328(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 296(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 264(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 232(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 200(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 168(%r11, %r12, 1), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm5
+ //
+ vmovapd 128(%r10), %ymm12
+ vbroadcastsd 352(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 320(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 288(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 256(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 224(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 192(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 160(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 128(%r11, %r12, 1), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm4
+ //
+ vmovapd 96(%r10), %ymm12
+ vbroadcastsd 376(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 344(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 312(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 280(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 248(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 216(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 184(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 152(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 120(%r11), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm3
+ //
+ vmovapd 64(%r10), %ymm12
+ vbroadcastsd 368(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 336(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 304(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 272(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 240(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 208(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 176(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 144(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 112(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 80(%r11), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm2
+ //
+ vmovapd 32(%r10), %ymm12
+ vbroadcastsd 360(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 328(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 296(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 264(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 232(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 200(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 168(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 136(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 104(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 40(%r11), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm1
+ //
+ vmovapd 0(%r10), %ymm12
+ vbroadcastsd 352(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ vbroadcastsd 320(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 288(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 256(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 224(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 192(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 160(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 128(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 96(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm12, %ymm13, %ymm0
+
+ vmovapd %ymm11, 352(%r10)
+ vmovapd %ymm10, 320(%r10)
+ vmovapd %ymm9, 288(%r10)
+ vmovapd %ymm8, 256(%r10)
+ vmovapd %ymm7, 224(%r10)
+ vmovapd %ymm6, 192(%r10)
+ vmovapd %ymm5, 160(%r10)
+ vmovapd %ymm4, 128(%r10)
+ vmovapd %ymm3, 96(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm0, 0(%r10)
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb_12_lib4, .-kernel_dlarfb_12_lib4
+#endif
+
+
+
+
+
+// read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 100...0 100...0 100...0 100...0 }
+#elif defined(OS_MAC)
+LC00: // { 100...0 100...0 100...0 100...0 }
+ .align 5
+#endif
+ .long 0x00000000
+ .long 0x80000000
+ .long 0x00000000
+ .long 0x80000000
+ .long 0x00000000
+ .long 0x80000000
+ .long 0x00000000
+ .long 0x80000000
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01:
+#elif defined(OS_MAC)
+LC01:
+ .align 5
+#endif
+ .double -1.0
+ .double -1.0
+ .double -1.0
+ .double -1.0
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02:
+#elif defined(OS_MAC)
+LC02:
+ .align 5
+#endif
+ .double 1.0
+ .double 1.0
+ .double 1.0
+ .double 1.0
+
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgelqf_4_lib4_bkp.c b/kernel/avx2/kernel_dgelqf_4_lib4_bkp.c
new file mode 100644
index 0000000..05c2d2e
--- /dev/null
+++ b/kernel/avx2/kernel_dgelqf_4_lib4_bkp.c
@@ -0,0 +1,282 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+#include "../../include/blasfeo_d_kernel.h"
+
+
+
+// assume n>=4
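+// computes an LQ factorization of the 4 x n panel pD (panel-major, ps=4),
+// returning the Householder scalars in dD and the 4x4 triangular factor of
+// the block reflector in pT (description added for reference, following the
+// LAPACK dgelqf/dlarft naming and the scalar code below)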
+void kernel_dgelqf_dlarft_4_lib4(int n, double *pD, double *dD, double *pT)
+ {
+	return; // early return: the scalar reference implementation below is not executed in this backup copy
+ int ii, jj, ll;
+ double alpha, beta, tmp, w0, w1, w2, w3;
+ const int ps = 4;
+ // zero tau matrix
+ for(ii=0; ii<16; ii++)
+ pT[ii] = 0.0;
+ // first column
+ beta = 0.0;
+ for(ii=1; ii<n; ii++)
+ {
+ tmp = pD[0+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[0] = 0.0;
+ goto col2;
+ }
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[0] = (beta-alpha) / beta;
+ pT[0+ps*0] = - dD[0];
+ tmp = -1.0 / (beta-alpha);
+ //
+ pD[0+ps*0] = beta;
+ w1 = pD[1+ps*0];
+ w2 = pD[2+ps*0];
+ w3 = pD[3+ps*0];
+ //
+ pD[0+ps*1] *= tmp;
+ w1 += pD[1+ps*1] * pD[0+ps*1];
+ w2 += pD[2+ps*1] * pD[0+ps*1];
+ w3 += pD[3+ps*1] * pD[0+ps*1];
+ //
+ pD[0+ps*2] *= tmp;
+ w1 += pD[1+ps*2] * pD[0+ps*2];
+ w2 += pD[2+ps*2] * pD[0+ps*2];
+ w3 += pD[3+ps*2] * pD[0+ps*2];
+ //
+ pD[0+ps*3] *= tmp;
+ w1 += pD[1+ps*3] * pD[0+ps*3];
+ w2 += pD[2+ps*3] * pD[0+ps*3];
+ w3 += pD[3+ps*3] * pD[0+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[0+ps*ii] *= tmp;
+ w1 += pD[1+ps*ii] * pD[0+ps*ii];
+ w2 += pD[2+ps*ii] * pD[0+ps*ii];
+ w3 += pD[3+ps*ii] * pD[0+ps*ii];
+ }
+ //
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ //
+ pD[1+ps*0] += w1;
+ pD[2+ps*0] += w2;
+ pD[3+ps*0] += w3;
+ //
+ pD[1+ps*1] += w1 * pD[0+ps*1];
+ pD[2+ps*1] += w2 * pD[0+ps*1];
+ pD[3+ps*1] += w3 * pD[0+ps*1];
+ //
+ pD[1+ps*2] += w1 * pD[0+ps*2];
+ pD[2+ps*2] += w2 * pD[0+ps*2];
+ pD[3+ps*2] += w3 * pD[0+ps*2];
+ beta = pD[1+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] += w1 * pD[0+ps*3];
+ pD[2+ps*3] += w2 * pD[0+ps*3];
+ pD[3+ps*3] += w3 * pD[0+ps*3];
+ beta += pD[1+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] += w1 * pD[0+ps*ii];
+ pD[2+ps*ii] += w2 * pD[0+ps*ii];
+ pD[3+ps*ii] += w3 * pD[0+ps*ii];
+ beta += pD[1+ps*ii] * pD[1+ps*ii];
+ }
+ // second column
+col2:
+ if(beta==0.0)
+ {
+ dD[1] = 0.0;
+ tmp = 0.0;
+ goto col3;
+ }
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[1] = (beta-alpha) / beta;
+ pT[1+ps*1] = - dD[1];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[1+ps*1] = beta;
+ w0 = pD[0+ps*1]; //
+ w2 = pD[2+ps*1];
+ w3 = pD[3+ps*1];
+ //
+ pD[1+ps*2] *= tmp;
+ w0 += pD[0+ps*2] * pD[1+ps*2]; //
+ w2 += pD[2+ps*2] * pD[1+ps*2];
+ w3 += pD[3+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] *= tmp;
+ w0 += pD[0+ps*3] * pD[1+ps*3]; //
+ w2 += pD[2+ps*3] * pD[1+ps*3];
+ w3 += pD[3+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
+ w2 += pD[2+ps*ii] * pD[1+ps*ii];
+ w3 += pD[3+ps*ii] * pD[1+ps*ii];
+ }
+ //
+ pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ //
+ pD[2+ps*1] += w2;
+ pD[3+ps*1] += w3;
+ //
+ pD[2+ps*2] += w2 * pD[1+ps*2];
+ pD[3+ps*2] += w3 * pD[1+ps*2];
+ //
+ pD[2+ps*3] += w2 * pD[1+ps*3];
+ pD[3+ps*3] += w3 * pD[1+ps*3];
+ beta = pD[2+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] += w2 * pD[1+ps*ii];
+ pD[3+ps*ii] += w3 * pD[1+ps*ii];
+ beta += pD[2+ps*ii] * pD[2+ps*ii];
+ }
+ // third column
+col3:
+ if(beta==0.0)
+ {
+ dD[2] = 0.0;
+ tmp = 0.0;
+ goto col4;
+ }
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[2] = (beta-alpha) / beta;
+ pT[2+ps*2] = - dD[2];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[2+ps*2] = beta;
+ w0 = pD[0+ps*2];
+ w1 = pD[1+ps*2];
+ w3 = pD[3+ps*2];
+ //
+ pD[2+ps*3] *= tmp;
+ w0 += pD[0+ps*3] * pD[2+ps*3];
+ w1 += pD[1+ps*3] * pD[2+ps*3];
+ w3 += pD[3+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[2+ps*ii];
+ w1 += pD[1+ps*ii] * pD[2+ps*ii];
+ w3 += pD[3+ps*ii] * pD[2+ps*ii];
+ }
+ //
+ pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
+ pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
+ w3 = - dD[2] * w3;
+//printf("\n%f %f %f\n", pT[0+ps*2], pT[1+ps*2], w3);
+//return;
+ //
+ pD[3+ps*2] += w3;
+ //
+ pD[3+ps*3] += w3 * pD[2+ps*3];
+ //
+ beta = 0.0;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] += w3 * pD[2+ps*ii];
+ beta += pD[3+ps*ii] * pD[3+ps*ii];
+ }
+ // fourth column
+col4:
+ if(beta==0.0)
+ {
+ dD[3] = 0.0;
+ tmp = 0.0;
+ return;
+ }
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[3] = (beta-alpha) / beta;
+ pT[3+ps*3] = - dD[3];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[3+ps*3] = beta;
+ w0 = pD[0+ps*3];
+ w1 = pD[1+ps*3];
+ w2 = pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[3+ps*ii];
+ w1 += pD[1+ps*ii] * pD[3+ps*ii];
+ w2 += pD[2+ps*ii] * pD[3+ps*ii];
+ }
+ //
+ pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
+ pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
+ pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
+ return;
+ }
+
+
+
+
diff --git a/kernel/avx2/kernel_dgemm_12x4_lib4.S b/kernel/avx2/kernel_dgemm_12x4_lib4.S
new file mode 100644
index 0000000..766cb92
--- /dev/null
+++ b/kernel/avx2/kernel_dgemm_12x4_lib4.S
@@ -0,0 +1,15536 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
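+// note: on the Windows x64 ABI the registers rdi, rsi and xmm6-xmm15 are
+// callee-saved, hence the larger stack frame and the additional saves and
+// restores in the PROLOGUE/EPILOGUE macros below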
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
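+// reference semantics (illustrative sketch added for documentation, not part
+// of the original source): the accumulation performed by this routine
+// corresponds to D(12x4) += A(12xk) * B(4xk)^T, i.e. roughly
+//
+//	for(kk=0; kk<k; kk++)
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<4; ii++)
+//				{
+//				D0[ii+4*jj] += A0[ii+4*kk] * B[jj+4*kk];
+//				D1[ii+4*jj] += A1[ii+4*kk] * B[jj+4*kk];
+//				D2[ii+4*jj] += A2[ii+4*kk] * B[jj+4*kk];
+//				}
+//
+// where A0, A1, A2 are the three 4-row panels of A (A1 = A0 + 4*sda, A2 = A0
+// + 8*sda, in doubles) and D0, D1, D2 are the 4x4 accumulator blocks held in
+// ymm0-ymm11; the in-register element order of the accumulators differs
+// between the broadcast and shuffle schemes, see the layout comments above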
+
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_12x4_lib4, @function
+inner_kernel_dgemm_add_nt_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_12x4_lib4:
+#endif
+#endif
+
+// broadcast scheme
+#if 1
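+// ("#if 1" selects this broadcast-based inner loop; the alternative
+// shuffle-based variant is kept below under the "#else" branch)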
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd	0(%r11, %r12, 2), %ymm15 // A2[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd	32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd	64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd	96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq $128, %r13
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 0(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd	0(%r11, %r12, 2), %ymm15 // A2
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd	32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd	64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd	96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq $128, %r13
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+// vmovapd 0(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+// vmovapd 0(%r11, %r12, 2), %ymm15 // A1
+
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ addq $32, %r11
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ subl $1, %r10d
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+// shuffle scheme
+#else
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ vmovapd 0(%r11), %ymm12 // A0[0]
+ vmovapd 0(%r13), %ymm15 // B[0]
+ vmovapd 0(%r11, %r12, 1), %ymm13 // A1[0]
+ vmovapd 0(%r11, %r12, 2), %ymm14 // A2[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ subl $4, %r10d
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 32(%r11), %ymm12 // A0[4]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vmovapd 32(%r11, %r12, 1), %ymm13 // A1[4]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 32(%r13), %ymm15 // B[4]
+ vmovapd 32(%r11, %r12, 2), %ymm14 // A2[4]
+
+ // unroll 1
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 64(%r11), %ymm12 // A0[8]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vmovapd 64(%r11, %r12, 1), %ymm13 // A1[8]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 64(%r13), %ymm15 // B[8]
+ vmovapd 64(%r11, %r12, 2), %ymm14 // A2[8]
+
+
+ // unroll 2
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 96(%r11), %ymm12 // A0[12]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vmovapd 96(%r11, %r12, 1), %ymm13 // A1[12]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 96(%r13), %ymm15 // B[12]
+ vmovapd 96(%r11, %r12, 2), %ymm14 // A2[12]
+
+
+ // unroll 3
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ addq $128, %r11
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ addq $128, %r13
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 0(%r11), %ymm12 // A0[0]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vmovapd 0(%r11, %r12, 1), %ymm13 // A1[0]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 0(%r13), %ymm15 // B[0]
+ vmovapd 0(%r11, %r12, 2), %ymm14 // A2[0]
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ subl $4, %r10d
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 32(%r11), %ymm12 // A0[4]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vmovapd 32(%r11, %r12, 1), %ymm13 // A1[4]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 32(%r13), %ymm15 // B[4]
+ vmovapd 32(%r11, %r12, 2), %ymm14 // A2[4]
+
+ // unroll 1
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 64(%r11), %ymm12 // A0[8]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vmovapd 64(%r11, %r12, 1), %ymm13 // A1[8]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 64(%r13), %ymm15 // B[8]
+ vmovapd 64(%r11, %r12, 2), %ymm14 // A2[8]
+
+
+ // unroll 2
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 96(%r11), %ymm12 // A0[12]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vmovapd 96(%r11, %r12, 1), %ymm13 // A1[12]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 96(%r13), %ymm15 // B[12]
+ vmovapd 96(%r11, %r12, 2), %ymm14 // A2[12]
+
+
+ // unroll 3
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ addq $128, %r11
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ addq $128, %r13
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+// cmpl $4, %r10d
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+// vmovapd 0(%r11), %ymm12 // A0[0]
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+// vmovapd 0(%r11, %r12, 1), %ymm13 // A1[0]
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+// vmovapd 0(%r13), %ymm15 // B[0]
+// vmovapd 0(%r11, %r12, 2), %ymm14 // A2[0]
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm12 // A0[4]
+ vmovapd 0(%r11, %r12, 1), %ymm13 // A1[4]
+ vmovapd 0(%r13), %ymm15 // B[4]
+ vmovapd 0(%r11, %r12, 2), %ymm14 // A2[4]
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ addq $32, %r11
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+ addq $32, %r13
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+
+ vperm2f128 $0x1, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ subl $1, %r10d
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+ vshufpd $0x5, %ymm15, %ymm15, %ymm15
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_12x4_lib4, .-inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
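+// note: this is the subtracting counterpart of the routine above - the loop
+// structure is identical, but vfnmadd231pd is used so the update reads
+// D(12x4) -= A(12xk) * B(4xk)^T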
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_12x4_lib4, @function
+inner_kernel_dgemm_sub_nt_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_12x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd	0(%r11, %r12, 2), %ymm15 // A2[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd	32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd	64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd	96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq $128, %r13
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 0(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A1
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd	32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd	64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd	96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq $128, %r13
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+// vmovapd 0(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+// vmovapd 0(%r11, %r12, 2), %ymm15 // A1
+
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+ addq $32, %r11
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+ subl $1, %r10d
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_12x4_lib4, .-inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- k
+// r11 <- A+4*sda*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
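+// reference sketch (illustrative only, not part of the build): a minimal C
+// version of the update this inner routine performs on lib4 panel-major
+// storage; the routine only accumulates into the ymm registers, scaling and
+// the C/D memory traffic are handled elsewhere; the function name and the
+// explicit D accumulator are made up for clarity.
+//
+// void ref_dgemm_add_nn_12x4_lib4(int k, const double *A, int sda,
+//                                 const double *B, int sdb, double D[12][4])
+//     {
+//     for(int kk=0; kk<k; kk++)
+//         {
+//         // row kk of B lives in panel kk/4, at row kk%4 of each packed column
+//         const double *Bp = B + (kk/4)*4*sdb + kk%4;
+//         for(int jj=0; jj<4; jj++)
+//             for(int ii=0; ii<12; ii++)
+//                 D[ii][jj] += A[(ii/4)*4*sda + kk*4 + ii%4] * Bp[jj*4];
+//         }
+//     }
+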
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_12x4_lib4, @function
+inner_kernel_dgemm_add_nn_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_12x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd			0(%r11, %r12, 2), %ymm15 // A2[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 2) // software prefetch
+ prefetcht0 64(%r13, %r14, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd			96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq %r14, %r13
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 0(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd			0(%r11, %r12, 2), %ymm15 // A2
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd			96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq %r14, %r13
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+// vmovapd 0(%r11), %ymm13 // A0
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+// vmovapd 0(%r11, %r12, 2), %ymm15 // A1
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ addq $32, %r11
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ subl $1, %r10d
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+ addq $8, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_12x4_lib4, .-inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
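+// reference note (illustrative only): this routine is the same as the _add_
+// kernel above except for the sign of the update (vfnmadd231pd instead of
+// vfmadd231pd); in the C sketch given there the accumulation statement
+// simply becomes
+//
+//     D[ii][jj] -= A[(ii/4)*4*sda + kk*4 + ii%4] * Bp[jj*4];
+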
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nn_12x4_lib4, @function
+inner_kernel_dgemm_sub_nn_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nn_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_12x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+	vmovapd			0(%r11, %r12, 2), %ymm15 // A2[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 2) // software prefetch
+ prefetcht0 64(%r13, %r14, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd			96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq %r14, %r13
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 0(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd			0(%r11, %r12, 2), %ymm15 // A2
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+ subl $4, %r10d
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd			32(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd			64(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vmovapd 96(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vmovapd 96(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+	vmovapd			96(%r11, %r12, 2), %ymm15 // A2
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 120(%r13), %ymm12
+ addq %r14, %r13
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+// vmovapd 0(%r11), %ymm13 // A0
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A1
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+// vmovapd 0(%r11, %r12, 2), %ymm15 // A1
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm13 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A1[0]
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A2[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vfnmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vfnmadd231pd %ymm15, %ymm12, %ymm9
+ addq $32, %r11
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vfnmadd231pd %ymm15, %ymm12, %ymm10
+ subl $1, %r10d
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ vfnmadd231pd %ymm15, %ymm12, %ymm11
+ addq $8, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nn_12x4_lib4, .-inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
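+// reference sketch (illustrative only, not part of the build): the 4x12
+// counterpart of the kernel above; A is a single 4-row panel and B is stored
+// panel-major with panel stride sdb; the function name and the explicit D
+// accumulator are made up for clarity.
+//
+// void ref_dgemm_add_nn_4x12_lib4(int k, const double *A,
+//                                 const double *B, int sdb, double D[4][12])
+//     {
+//     for(int kk=0; kk<k; kk++)
+//         {
+//         const double *Bp = B + (kk/4)*4*sdb + kk%4;
+//         for(int jj=0; jj<12; jj++)
+//             for(int ii=0; ii<4; ii++)
+//                 D[ii][jj] += A[kk*4+ii] * Bp[jj*4];
+//         }
+//     }
+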
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x12_lib4, @function
+inner_kernel_dgemm_add_nn_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_4x12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x12_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+ prefetcht0 128(%r12, %r13, 2) // software prefetch
+ prefetcht0 192(%r12, %r13, 2) // software prefetch
+ prefetcht0 256(%r12, %r13, 2) // software prefetch
+ prefetcht0 320(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+ vbroadcastsd 256(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm8
+ vbroadcastsd 288(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm9
+ vbroadcastsd 320(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm10
+ vbroadcastsd 352(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm11
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 136(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 168(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 200(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 232(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vbroadcastsd 264(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm8
+ vbroadcastsd 296(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm9
+ vbroadcastsd 328(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm10
+ vbroadcastsd 360(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm11
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 144(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 176(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 208(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 240(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+ vbroadcastsd 272(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm8
+ vbroadcastsd 304(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm9
+ vbroadcastsd 336(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm10
+ vbroadcastsd 368(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm11
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 152(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 184(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 216(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 248(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vbroadcastsd 280(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm8
+ vbroadcastsd 312(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm9
+ vbroadcastsd 344(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm10
+ vbroadcastsd 376(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm11
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+ vbroadcastsd 256(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm8
+ vbroadcastsd 288(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm9
+ vbroadcastsd 320(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm10
+ vbroadcastsd 352(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm11
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 136(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 168(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 200(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 232(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vbroadcastsd 264(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm8
+ vbroadcastsd 296(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm9
+ vbroadcastsd 328(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm10
+ vbroadcastsd 360(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm11
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 144(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 176(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 208(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 240(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+ vbroadcastsd 272(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm8
+ vbroadcastsd 304(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm9
+ vbroadcastsd 336(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm10
+ vbroadcastsd 368(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm11
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 152(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 184(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 216(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 248(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vbroadcastsd 280(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm8
+ vbroadcastsd 312(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm9
+ vbroadcastsd 344(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm10
+ vbroadcastsd 376(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm11
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+ vbroadcastsd 256(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm8
+ vbroadcastsd 288(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm9
+ vbroadcastsd 320(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm10
+ vbroadcastsd 352(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm11
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x12_lib4, .-inner_kernel_dgemm_add_nn_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- B
+// r12 <- C
+// r13 <- 32*sdc
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+// ymm4 <- [a40 a50 a60 a70]
+// ymm5 <- [a41 a51 a61 a71]
+// ymm6 <- [a42 a52 a62 a72]
+// ymm7 <- [a43 a53 a63 a73]
+// ymm8 <- [a80 a90 aa0 ab0]
+// ymm9 <- [a81 a91 aa1 ab1]
+// ymm10 <- [a82 a92 aa2 ab2]
+// ymm11 <- [a83 a93 aa3 ab3]
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- ?
+// r12 <- ?
+// r13 <- 32*sdc
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+// ymm4 <- [a40 a50 a60 a70]
+// ymm5 <- [a41 a51 a61 a71]
+// ymm6 <- [a42 a52 a62 a72]
+// ymm7 <- [a43 a53 a63 a73]
+// ymm8 <- [a80 a90 aa0 ab0]
+// ymm9 <- [a81 a91 aa1 ab1]
+// ymm10 <- [a82 a92 aa2 ab2]
+// ymm11 <- [a83 a93 aa3 ab3]
+
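+// reference sketch (illustrative only, not part of the build): here the
+// count in r10d is the number of columns of B and C, the 12x4 block of A is
+// already held in ymm0-ymm11, B is a single 4-row panel and C spans three
+// 4-row panels with panel stride sdc; the function name and the explicit A
+// array are made up for clarity.
+//
+// void ref_dgebp_add_nn_12x4_lib4(int n, const double A[12][4],
+//                                 const double *B, double *C, int sdc)
+//     {
+//     for(int jj=0; jj<n; jj++)
+//         for(int ii=0; ii<12; ii++)
+//             for(int kk=0; kk<4; kk++)
+//                 C[(ii/4)*4*sdc + jj*4 + ii%4] += A[ii][kk] * B[jj*4 + kk];
+//     }
+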
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEBP_ADD_NN_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgebp_add_nn_12x4_lib4, @function
+inner_kernel_dgebp_add_nn_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgebp_add_nn_12x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_12x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vmovapd 0(%r12, %r13, 2), %ymm15
+ vbroadcastsd 0(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vbroadcastsd 8(%r11), %ymm13
+ subl $4, %r10d
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vfmadd231pd %ymm9, %ymm13, %ymm15
+ vbroadcastsd 16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vfmadd231pd %ymm10, %ymm13, %ymm15
+ vbroadcastsd 24(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm15
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+ vmovapd %ymm15, 0(%r12, %r13, 2)
+
+ vmovapd 32(%r12), %ymm12
+ vmovapd 32(%r12, %r13, 1), %ymm14
+ vmovapd 32(%r12, %r13, 2), %ymm15
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vbroadcastsd 40(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vfmadd231pd %ymm9, %ymm13, %ymm15
+ vbroadcastsd 48(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vfmadd231pd %ymm10, %ymm13, %ymm15
+ vbroadcastsd 56(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm15
+ vmovapd %ymm12, 32(%r12)
+ vmovapd %ymm14, 32(%r12, %r13, 1)
+ vmovapd %ymm15, 32(%r12, %r13, 2)
+
+ vmovapd 64(%r12), %ymm12
+ vmovapd 64(%r12, %r13, 1), %ymm14
+ vmovapd 64(%r12, %r13, 2), %ymm15
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vfmadd231pd %ymm9, %ymm13, %ymm15
+ vbroadcastsd 80(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vfmadd231pd %ymm10, %ymm13, %ymm15
+ vbroadcastsd 88(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm15
+ vmovapd %ymm12, 64(%r12)
+ vmovapd %ymm14, 64(%r12, %r13, 1)
+ vmovapd %ymm15, 64(%r12, %r13, 2)
+
+ vmovapd 96(%r12), %ymm12
+ vmovapd 96(%r12, %r13, 1), %ymm14
+ vmovapd 96(%r12, %r13, 2), %ymm15
+ vbroadcastsd 96(%r11), %ymm13
+ addq $128, %r11
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vbroadcastsd -24(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vfmadd231pd %ymm9, %ymm13, %ymm15
+ vbroadcastsd -16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vfmadd231pd %ymm10, %ymm13, %ymm15
+ vbroadcastsd -8(%r11), %ymm13
+ addq $128, %r12
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm15
+ vmovapd %ymm12, -32(%r12)
+ vmovapd %ymm14, -32(%r12, %r13, 1)
+ vmovapd %ymm15, -32(%r12, %r13, 2)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vmovapd 0(%r12, %r13, 2), %ymm15
+ vbroadcastsd 0(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vbroadcastsd 8(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vfmadd231pd %ymm9, %ymm13, %ymm15
+ vbroadcastsd 16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vfmadd231pd %ymm10, %ymm13, %ymm15
+ vbroadcastsd 24(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm15
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+ vmovapd %ymm15, 0(%r12, %r13, 2)
+
+ addq $32, %r11
+ addq $32, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgebp_add_nn_12x4_lib4, .-inner_kernel_dgebp_add_nn_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 32*sdb
+// r14 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <-
+// ymm5 <-
+// ymm6 <-
+// ymm7 <-
+// ymm8 <-
+// ymm9 <-
+// ymm10 <-
+// ymm11 <-
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A
+// r12 <- B+?
+// r13 <- 32*sdb
+// r14 <- C+?
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <-
+// ymm5 <-
+// ymm6 <-
+// ymm7 <-
+// ymm8 <-
+// ymm9 <-
+// ymm10 <-
+// ymm11 <-
+
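+// reference sketch (illustrative only, not part of the build): the 4x12
+// counterpart of the routine above; the count in r10d is the number of
+// columns of B and C, A is a fixed 4x12 block (12 packed columns of 4
+// doubles), B spans three 4-row panels with panel stride sdb and C is a
+// single 4-row panel; the function name is made up for clarity.
+//
+// void ref_dgebp_add_nn_4x12_lib4(int n, const double *A,
+//                                 const double *B, int sdb, double *C)
+//     {
+//     for(int jj=0; jj<n; jj++)
+//         for(int ii=0; ii<4; ii++)
+//             for(int kk=0; kk<12; kk++)
+//                 C[jj*4+ii] += A[kk*4+ii] * B[(kk/4)*4*sdb + jj*4 + kk%4];
+//     }
+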
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEBP_ADD_NN_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgebp_add_nn_4x12_lib4, @function
+inner_kernel_dgebp_add_nn_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgebp_add_nn_4x12_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_4x12_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $11, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+ vmovapd 32(%r14), %ymm1
+ vmovapd 64(%r14), %ymm2
+ vmovapd 96(%r14), %ymm3
+ vmovapd 128(%r14), %ymm4
+ vmovapd 160(%r14), %ymm5
+ vmovapd 192(%r14), %ymm6
+ vmovapd 224(%r14), %ymm7
+ vmovapd 256(%r14), %ymm8
+ vmovapd 288(%r14), %ymm9
+ vmovapd 320(%r14), %ymm10
+ vmovapd 352(%r14), %ymm11
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 8
+ vmovapd 256(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 9
+ vmovapd 288(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 10
+ vmovapd 320(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // 11
+ vmovapd 352(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 152(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+ vmovapd %ymm1, 32(%r14)
+ vmovapd %ymm2, 64(%r14)
+ vmovapd %ymm3, 96(%r14)
+ vmovapd %ymm4, 128(%r14)
+ vmovapd %ymm5, 160(%r14)
+ vmovapd %ymm6, 192(%r14)
+ vmovapd %ymm7, 224(%r14)
+ vmovapd %ymm8, 256(%r14)
+ vmovapd %ymm9, 288(%r14)
+ vmovapd %ymm10, 320(%r14)
+ vmovapd %ymm11, 352(%r14)
+
+ addq $384, %r12
+ addq $384, %r14
+ subl $12, %r10d
+
+ cmpl $11, %r10d
+ jg 1b // main loop
+
+2:
+ cmpl $3, %r10d
+ jle 2f // return
+
+ // cleanup loop
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+ vmovapd 32(%r14), %ymm1
+ vmovapd 64(%r14), %ymm2
+ vmovapd 96(%r14), %ymm3
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 8
+ vmovapd 256(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 9
+ vmovapd 288(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 72(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 10
+ vmovapd 320(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 48(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 112(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // 11
+ vmovapd 352(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 56(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 88(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+ vmovapd %ymm1, 32(%r14)
+ vmovapd %ymm2, 64(%r14)
+ vmovapd %ymm3, 96(%r14)
+
+ addq $128, %r12
+ addq $128, %r14
+ subl $4, %r10d
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+2:
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+1:
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 8
+ vmovapd 256(%r11), %ymm12
+ vbroadcastsd 0(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 9
+ vmovapd 288(%r11), %ymm12
+ vbroadcastsd 8(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 10
+ vmovapd 320(%r11), %ymm12
+ vbroadcastsd 16(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // 11
+ vmovapd 352(%r11), %ymm12
+ vbroadcastsd 24(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+
+ addq $32, %r12
+ addq $32, %r14
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 1b // main loop
+
+ // return
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgebp_add_nn_4x12_lib4, .-inner_kernel_dgebp_add_nn_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
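+// reference sketch (illustrative only) of the peel logic below: the edge
+// consumes kend = min(k, 4-offB) rows of the partially used first panel of
+// B, one rank-1 update of the 12x4 accumulator at a time, then realigns B
+// to the start of the next panel; the variable names are made up for
+// clarity.
+//
+//     int kend = 4 - offB;
+//     if(kend > k) kend = k;                  // cmovgl below
+//     B += offB;                              // skip the offset rows of the panel
+//     for(int ll=0; ll<kend; ll++, A+=4, B+=1)
+//         ;                                   // one 12x4 rank-1 update
+//     k -= kend;
+//     if(k > 0) B += 4*sdb - 4;               // realign to the next panel of B
+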
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_12x4_lib4, @function
+inner_edge_dgemm_add_nn_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_12x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r15d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %ebx
+ subl %r15d, %ebx // 4-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,4-offsetB)
+
+ movl %r15d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r13 // B+offsetB*sizeof(double)
+
+ movq %r11, %rax // A1 <- A0
+ addq %r12, %rax // A1 <- A0 + 4*sda*sizeof(double)
+
+ movq %rax, %rbp // A2 <- A1
+ addq %r12, %rbp // A2 <- A1 + 4*sda*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm12 // A0[0]
+ vmovapd 0(%rax), %ymm14 // A1[0]
+ vmovapd 0(%rbp), %ymm15 // A2[0]
+ vbroadcastsd 0(%r13), %ymm13 // B[0]
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vfmadd231pd %ymm14, %ymm13, %ymm4
+ vfmadd231pd %ymm15, %ymm13, %ymm8
+ vbroadcastsd 32(%r13), %ymm13 // B[1]
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vfmadd231pd %ymm14, %ymm13, %ymm5
+ vfmadd231pd %ymm15, %ymm13, %ymm9
+ vbroadcastsd 64(%r13), %ymm13 // B[2]
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vfmadd231pd %ymm14, %ymm13, %ymm6
+ vfmadd231pd %ymm15, %ymm13, %ymm10
+ vbroadcastsd 96(%r13), %ymm13 // B[3]
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vfmadd231pd %ymm14, %ymm13, %ymm7
+ vfmadd231pd %ymm15, %ymm13, %ymm11
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+	addq		$32, %r11 // A0+1*bs*sizeof(double)
+	addq		$32, %rax // A1+1*bs*sizeof(double)
+	addq		$32, %rbp // A2+1*bs*sizeof(double)
+	addq		$8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r14, %r13
+ subq $32, %r13 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_12x4_lib4, .-inner_edge_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
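+// reference note (illustrative only): same peel structure as the 12x4 edge
+// above (kend = min(k, 4-offB) single updates, then realign B to the next
+// panel), but here one column of A updates all 12 accumulator columns per
+// iteration:
+//
+//     for(int jj=0; jj<12; jj++)
+//         for(int ii=0; ii<4; ii++)
+//             D[ii][jj] += A[ii] * B[jj*4];   // B[jj*4] is row kk of column jj
+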
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x12_lib4, @function
+inner_edge_dgemm_add_nn_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_4x12_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x12_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %r15d
+ subl %r14d, %r15d // 4-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,4-offsetB)
+
+ movl %r14d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+	vbroadcastsd	128(%r12), %ymm13 // B[4]
+	vfmadd231pd	%ymm12, %ymm13, %ymm4
+	vbroadcastsd	160(%r12), %ymm13 // B[5]
+	vfmadd231pd	%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12), %ymm13 // B[6]
+	vfmadd231pd	%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12), %ymm13 // B[7]
+	vfmadd231pd	%ymm12, %ymm13, %ymm7
+	vbroadcastsd	256(%r12), %ymm13 // B[8]
+	vfmadd231pd	%ymm12, %ymm13, %ymm8
+	vbroadcastsd	288(%r12), %ymm13 // B[9]
+	vfmadd231pd	%ymm12, %ymm13, %ymm9
+	vbroadcastsd	320(%r12), %ymm13 // B[10]
+	vfmadd231pd	%ymm12, %ymm13, %ymm10
+	vbroadcastsd	352(%r12), %ymm13 // B[11]
+	vfmadd231pd	%ymm12, %ymm13, %ymm11
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+	addq	$32, %r11 // A+1*bs*sizeof(double)
+	addq	$8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x12_lib4, .-inner_edge_dgemm_add_nn_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- 4*sda*sizeof(double)
+// r12 <- B
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- 4*sda*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
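+// note (informal): B is upper triangular, so in D += A * B^T the k-iteration j only touches
+// result columns 0..j. The edge below unrolls the first 4 iterations accordingly: iteration j
+// loads column j of the three A panels and broadcasts B[i][j] for i=0..j (element (i,j) of
+// the packed 4x4 block sits at byte offset 8*i+32*j); A and B then advance by 128 bytes.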
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_12x4_lib4, @function
+inner_edge_dtrmm_nt_ru_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_12x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r15 // A1 <- A0
+ addq %r11, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ movq %r15, %r14 // A2 <- A1
+ addq %r11, %r14 // A2 <- A1 + 4*sda*sizeof(double)
+
+ vbroadcastsd 0(%r12), %ymm12
+ vmovapd 0(%r10), %ymm13
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r14), %ymm15
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 32(%r10), %ymm13
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r15), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 32(%r14), %ymm15
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 40(%r12), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vbroadcastsd 64(%r12), %ymm12
+ vmovapd 64(%r10), %ymm13
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 64(%r15), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r14), %ymm15
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 72(%r12), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 80(%r12), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vbroadcastsd 96(%r12), %ymm12
+ vmovapd 96(%r10), %ymm13
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 96(%r15), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 96(%r14), %ymm15
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 104(%r12), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 112(%r12), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 120(%r12), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ addq $128, %r10
+ addq $128, %r12
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_12x4_lib4, .-inner_edge_dtrmm_nt_ru_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*4*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
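+// note (informal): same triangular edge as above, but safe for k<4: the remaining iteration
+// count is re-checked after every column and the A/B pointers advance one column at a time,
+// so the routine stops early when k is exhausted.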
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_12x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_12x4_vs_lib4:
+#endif
+#endif
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ movq %r15, %r14 // A2 <- A1
+ addq %r12, %r14 // A2 <- A1 + 4*sda*sizeof(double)
+
+ vbroadcastsd 0(%r13), %ymm12
+ addq $32, %r13
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm13
+ addq $32, %r11
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm14
+ addq $32, %r15
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r14), %ymm15
+ addq $32, %r14
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm13
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r14), %ymm15
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ addq $32, %r11
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ addq $32, %r13
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ addq $32, %r15
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm13
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r14), %ymm15
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ addq $32, %r11
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ addq $32, %r13
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ addq $32, %r15
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ addq $32, %r14
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm13
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r14), %ymm15
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ addq $32, %r11
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ addq $32, %r13
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq $32, %r15
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+ addq $32, %r14
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_12x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A0
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
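+// note (informal): B is lower triangular and packed in 4-row panels, so it may start at row
+// offB (0..3) of its panel. The edge below is unrolled once per offset case; each case
+// consumes the initial iterations that overlap the triangular part of B, using only the
+// stored lower-triangular entries, and realigns B at the panel boundary.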
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_12x4_lib4, @function
+inner_edge_dtrmm_nn_rl_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_12x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r15d
+ jg 0f
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vmovapd 32(%r11), %ymm13
+ vmovapd 32(%r11, %r12, 1), %ymm14
+ vmovapd 32(%r11, %r12, 2), %ymm15
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vmovapd 64(%r11), %ymm13
+ vmovapd 64(%r11, %r12, 1), %ymm14
+ vmovapd 64(%r11, %r12, 2), %ymm15
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vmovapd 96(%r11), %ymm13
+ vmovapd 96(%r11, %r12, 1), %ymm14
+ vmovapd 96(%r11, %r12, 2), %ymm15
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+0:
+ cmpl $1, %r15d
+ jg 1f
+
+ // offB==1
+
+ addq $8, %r13 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vmovapd 32(%r11), %ymm13
+ vmovapd 32(%r11, %r12, 1), %ymm14
+ vmovapd 32(%r11, %r12, 2), %ymm15
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vmovapd 64(%r11), %ymm13
+ vmovapd 64(%r11, %r12, 1), %ymm14
+ vmovapd 64(%r11, %r12, 2), %ymm15
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ subl $3, %r10d // k-3
+ addq $96, %r11 // A0+3*bs*sizeof(double)
+ addq %r14, %r13
+ subq $8, %r13 // B+bs*sdb*sizeof(double)-1
+
+ jmp 3f
+
+1:
+ cmpl $2, %r15d
+ jg 2f
+
+ // offB==2
+
+ addq $16, %r13 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ vmovapd 32(%r11), %ymm13
+ vmovapd 32(%r11, %r12, 1), %ymm14
+ vmovapd 32(%r11, %r12, 2), %ymm15
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ subl $2, %r10d // k-2
+ addq $64, %r11 // A0+2*bs*sizeof(double)
+ addq %r14, %r13
+ subq $16, %r13 // B+bs*sdb*sizeof(double)-2
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vmovapd 32(%r11), %ymm13
+ vmovapd 32(%r11, %r12, 1), %ymm14
+ vmovapd 32(%r11, %r12, 2), %ymm15
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ vmovapd 64(%r11), %ymm13
+ vmovapd 64(%r11, %r12, 1), %ymm14
+ vmovapd 64(%r11, %r12, 2), %ymm15
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ vmovapd 96(%r11), %ymm13
+ vmovapd 96(%r11, %r12, 1), %ymm14
+ vmovapd 96(%r11, %r12, 2), %ymm15
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r13 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-3
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ vmovapd 32(%r11), %ymm13
+ vmovapd 32(%r11, %r12, 1), %ymm14
+ vmovapd 32(%r11, %r12, 2), %ymm15
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ vmovapd 64(%r11), %ymm13
+ vmovapd 64(%r11, %r12, 1), %ymm14
+ vmovapd 64(%r11, %r12, 2), %ymm15
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ vmovapd 96(%r11), %ymm13
+ vmovapd 96(%r11, %r12, 1), %ymm14
+ vmovapd 96(%r11, %r12, 2), %ymm15
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_12x4_lib4, .-inner_edge_dtrmm_nn_rl_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A0
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
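+// note (informal): variable-size version of the edge above: the same offB cases, but the
+// remaining iteration count is re-checked after every single column so the routine also
+// handles small k correctly.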
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_12x4_vs_lib4, @function
+inner_edge_dtrmm_nn_rl_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_12x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ cmpl $0, %r15d
+ jg 0f // offB>0
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+0:
+ cmpl $1, %r15d
+ jg 1f // offB>1
+
+ // offB==1
+
+ addq $8, %r13 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+1:
+ cmpl $2, %r15d
+ jg 2f // offB>2
+
+ // offB==2
+
+ addq $16, %r13 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+	subl	$1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r13 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm13
+ vmovapd 0(%r11, %r12, 1), %ymm14
+ vmovapd 0(%r11, %r12, 2), %ymm15
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm15, %ymm12, %ymm8
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm15, %ymm12, %ymm9
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vfmadd231pd %ymm15, %ymm12, %ymm10
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ vfmadd231pd %ymm15, %ymm12, %ymm11
+
+	subl	$1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_12x4_vs_lib4, .-inner_edge_dtrmm_nn_rl_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
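+// note (informal): the main loop (not shown here) appears to accumulate with the lanes of B
+// rotated, so each register holds one element per result column (see the input map above).
+// The vblendpd pairs below only reshuffle lanes, restoring plain columns 0..3 in ymm0..ymm3,
+// ymm4..ymm7 and ymm8..ymm11; no arithmetic is performed.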
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_12x4_lib4, @function
+inner_blend_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_12x4_lib4; .scl 2; .type 32; .endef
+inner_blend_12x4_lib4:
+#endif
+#endif
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm12
+ vblendpd $0x5, %ymm1, %ymm0, %ymm13
+ vblendpd $0xa, %ymm3, %ymm2, %ymm14
+ vblendpd $0x5, %ymm3, %ymm2, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm0
+ vblendpd $0x3, %ymm14, %ymm12, %ymm2
+ vblendpd $0xc, %ymm15, %ymm13, %ymm1
+ vblendpd $0x3, %ymm15, %ymm13, %ymm3
+
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm12
+ vblendpd $0x5, %ymm5, %ymm4, %ymm13
+ vblendpd $0xa, %ymm7, %ymm6, %ymm14
+ vblendpd $0x5, %ymm7, %ymm6, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm4
+ vblendpd $0x3, %ymm14, %ymm12, %ymm6
+ vblendpd $0xc, %ymm15, %ymm13, %ymm5
+ vblendpd $0x3, %ymm15, %ymm13, %ymm7
+
+
+ vblendpd $0xa, %ymm9, %ymm8, %ymm12
+ vblendpd $0x5, %ymm9, %ymm8, %ymm13
+ vblendpd $0xa, %ymm11, %ymm10, %ymm14
+ vblendpd $0x5, %ymm11, %ymm10, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm8
+ vblendpd $0x3, %ymm14, %ymm12, %ymm10
+ vblendpd $0xc, %ymm15, %ymm13, %ymm9
+ vblendpd $0x3, %ymm15, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_12x4_lib4, .-inner_blend_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose
+//
+// input arguments:
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
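+// note (informal): standard 4x4 in-register transpose, applied to each of the three register
+// quadruples: vunpcklpd/vunpckhpd interleave 64-bit elements within the 128-bit halves and
+// vperm2f128 then recombines the halves, i.e. roughly out[i][j] = in[j][i] for each 4x4 block.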
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_12x4_lib4, @function
+inner_tran_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_tran_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_12x4_lib4; .scl 2; .type 32; .endef
+inner_tran_12x4_lib4:
+#endif
+#endif
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm1, %ymm0, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm3, %ymm2, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm5, %ymm4, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm7, %ymm6, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm7
+
+ vunpcklpd %ymm9, %ymm8, %ymm12
+ vunpckhpd %ymm9, %ymm8, %ymm13
+ vunpcklpd %ymm11, %ymm10, %ymm14
+ vunpckhpd %ymm11, %ymm10, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm8
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm10
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm9
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_12x4_lib4, .-inner_tran_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
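+// note (informal): alpha==1.0 and beta==1.0, so no multiply by alpha is needed; the 1.0
+// constant (LC05) is used with one fmadd per register, i.e. roughly
+//   D[i][j] = acc[i][j] + C[i][j];
+// with rows 4..7 and 8..11 of C read from the panels at C + 4*sdc and C + 8*sdc (in doubles).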
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_12x4_lib4, @function
+inner_scale_11_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_12x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_12x4_lib4:
+#endif
+#endif
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC05(%rip), %ymm14 // beta=1.0
+#else
+ vmovapd LC05(%rip), %ymm14 // beta=1.0
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 0(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 32(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 64(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 96(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_12x4_lib4, .-inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
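+// note (informal): computes, as a reference,
+//   for(j=0; j<4; j++) for(i=0; i<12; i++) D[i][j] = alpha*acc[i][j] + beta*C[i][j];
+// the vucomisd/je test skips the whole beta*C update when beta==0.0, so C is not read in
+// that case; rows 4..7 and 8..11 of C come from the panels at C + 4*sdc and C + 8*sdc.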
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_12x4_lib4, @function
+inner_scale_ab_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_12x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_12x4_lib4:
+#endif
+#endif
+
+	vbroadcastsd	0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm8, %ymm15, %ymm8
+ vmulpd %ymm9, %ymm15, %ymm9
+ vmulpd %ymm10, %ymm15, %ymm10
+ vmulpd %ymm11, %ymm15, %ymm11
+
+ movq %r12, %r15 // C1 <- C0
+ addq %r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ movq %r15, %r14 // C2 <- C1
+ addq %r13, %r14 // C2 <- C1 + 4*sdc*sizeof(double)
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 0(%r14), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 32(%r14), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 64(%r14), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 96(%r14), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_12x4_lib4, .-inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- &alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
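+// note (informal): beta==0.0 case: the 12 accumulators are simply multiplied by alpha
+// (broadcast from r10) and C is never touched, i.e. roughly D[i][j] = alpha*acc[i][j].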
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_12x4_lib4, @function
+inner_scale_a0_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_12x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_12x4_lib4:
+#endif
+#endif
+
+	vbroadcastsd	0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm8, %ymm15, %ymm8
+ vmulpd %ymm9, %ymm15, %ymm9
+ vmulpd %ymm10, %ymm15, %ymm10
+ vmulpd %ymm11, %ymm15, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_12x4_lib4, .-inner_scale_a0_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
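+// note (informal): fused variant of INNER_BLEND_12X4_LIB4 followed by the alpha/beta scaling:
+// the vblendpd pairs restore plain column order, each block is multiplied by alpha, and if
+// beta!=0.0 beta*C is added from the three C panels (C, C + 4*sdc, C + 8*sdc), i.e. roughly
+//   D[i][j] = alpha*acc[i][j] + beta*C[i][j];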
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_12x4_lib4, @function
+inner_blend_scale_ab_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_12x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_12x4_lib4:
+#endif
+#endif
+
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm12
+ vblendpd $0x5, %ymm1, %ymm0, %ymm13
+ vblendpd $0xa, %ymm3, %ymm2, %ymm14
+ vblendpd $0x5, %ymm3, %ymm2, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm0
+ vblendpd $0x3, %ymm14, %ymm12, %ymm2
+ vblendpd $0xc, %ymm15, %ymm13, %ymm1
+ vblendpd $0x3, %ymm15, %ymm13, %ymm3
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm12
+ vblendpd $0x5, %ymm5, %ymm4, %ymm13
+ vblendpd $0xa, %ymm7, %ymm6, %ymm14
+ vblendpd $0x5, %ymm7, %ymm6, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm4
+ vblendpd $0x3, %ymm14, %ymm12, %ymm6
+ vblendpd $0xc, %ymm15, %ymm13, %ymm5
+ vblendpd $0x3, %ymm15, %ymm13, %ymm7
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vblendpd $0xa, %ymm9, %ymm8, %ymm12
+ vblendpd $0x5, %ymm9, %ymm8, %ymm13
+ vblendpd $0xa, %ymm11, %ymm10, %ymm14
+ vblendpd $0x5, %ymm11, %ymm10, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm8
+ vblendpd $0x3, %ymm14, %ymm12, %ymm10
+ vblendpd $0xc, %ymm15, %ymm13, %ymm9
+ vblendpd $0x3, %ymm15, %ymm13, %ymm11
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm8, %ymm15, %ymm8
+ vmulpd %ymm9, %ymm15, %ymm9
+ vmulpd %ymm10, %ymm15, %ymm10
+ vmulpd %ymm11, %ymm15, %ymm11
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 0(%r12, %r13, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 32(%r12, %r13, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 64(%r12, %r13, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 96(%r12, %r13, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_12x4_lib4, .-inner_blend_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm8 <- [d08 d18 d28 d38]
+// ymm9 <- [d09 d19 d29 d39]
+// ymm10 <- [d0a d1a d2a d3a]
+// ymm11 <- [d0b d1b d2b d3b]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d04 d14 d24 d34]
+// ymm5 <- [d05 d15 d25 d35]
+// ymm6 <- [d06 d16 d26 d36]
+// ymm7 <- [d07 d17 d27 d37]
+// ymm8 <- [d08 d18 d28 d38]
+// ymm9 <- [d09 d19 d29 d39]
+// ymm10 <- [d0a d1a d2a d3a]
+// ymm11 <- [d0b d1b d2b d3b]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
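+// note (informal): same alpha/beta scaling, but for a 4x12 block whose 12 columns are stored
+// contiguously in a single panel of C (byte offsets 0, 32, ..., 352); as a reference,
+//   for(j=0; j<12; j++) for(i=0; i<4; i++) D[i][j] = alpha*acc[i][j] + beta*C[i][j];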
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x12_lib4, @function
+inner_scale_ab_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x12_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x12_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+ vmulpd %ymm8, %ymm15, %ymm8
+ vmulpd %ymm9, %ymm15, %ymm9
+ vmulpd %ymm10, %ymm15, %ymm10
+ vmulpd %ymm11, %ymm15, %ymm11
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+ vmovapd 128(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+ vmovapd 256(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 288(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 320(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 352(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x12_lib4, .-inner_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
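+// note (informal): applies the same vunpcklpd/vunpckhpd + vperm2f128 4x4 lane shuffle as
+// INNER_TRAN_12X4_LIB4 to the three register quadruples, then multiplies by alpha and, if
+// beta!=0.0, adds beta*C for a contiguous 4x12 panel of C (byte offsets 0..352).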
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x12_lib4, @function
+inner_tran_scale_ab_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x12_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x12_lib4:
+#endif
+#endif
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm1, %ymm0, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm3, %ymm2, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm3
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm5, %ymm4, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm7, %ymm6, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm7
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vunpcklpd %ymm9, %ymm8, %ymm12
+ vunpckhpd %ymm9, %ymm8, %ymm13
+ vunpcklpd %ymm11, %ymm10, %ymm14
+ vunpckhpd %ymm11, %ymm10, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm8
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm10
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm9
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm11
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm8, %ymm15, %ymm8
+ vmulpd %ymm9, %ymm15, %ymm9
+ vmulpd %ymm10, %ymm15, %ymm10
+ vmulpd %ymm11, %ymm15, %ymm11
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 128(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 256(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 288(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 320(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 352(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x12_lib4, .-inner_tran_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
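+// reference sketch (plain C, not part of this file; names are illustrative):
+// the scalar operation implemented below, once the vblendpd sequence has put
+// the accumulator acc back into natural column order; C is read from three
+// 4-row panels spaced by 4*sdc doubles (r11 holds 4*sdc*sizeof(double)):
+//
+//	// acc += C, i.e. alpha = beta = 1.0
+//	static void blend_scale_11_12x4_ref(double *acc, const double *C, int sdc)
+//		{
+//		for(int j=0; j<4; j++)
+//			for(int i=0; i<12; i++)
+//				acc[i+12*j] += C[(i/4)*4*sdc + i%4 + 4*j];
+//		}
+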
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_12x4_lib4, @function
+inner_blend_scale_11_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_12x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_12x4_lib4:
+#endif
+#endif
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm12
+ vblendpd $0x5, %ymm1, %ymm0, %ymm13
+ vblendpd $0xa, %ymm3, %ymm2, %ymm14
+ vblendpd $0x5, %ymm3, %ymm2, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm0
+ vblendpd $0x3, %ymm14, %ymm12, %ymm2
+ vblendpd $0xc, %ymm15, %ymm13, %ymm1
+ vblendpd $0x3, %ymm15, %ymm13, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm12
+ vblendpd $0x5, %ymm5, %ymm4, %ymm13
+ vblendpd $0xa, %ymm7, %ymm6, %ymm14
+ vblendpd $0x5, %ymm7, %ymm6, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm4
+ vblendpd $0x3, %ymm14, %ymm12, %ymm6
+ vblendpd $0xc, %ymm15, %ymm13, %ymm5
+ vblendpd $0x3, %ymm15, %ymm13, %ymm7
+
+ vblendpd $0xa, %ymm9, %ymm8, %ymm12
+ vblendpd $0x5, %ymm9, %ymm8, %ymm13
+ vblendpd $0xa, %ymm11, %ymm10, %ymm14
+ vblendpd $0x5, %ymm11, %ymm10, %ymm15
+
+ vblendpd $0xc, %ymm14, %ymm12, %ymm8
+ vblendpd $0x3, %ymm14, %ymm12, %ymm10
+ vblendpd $0xc, %ymm15, %ymm13, %ymm9
+ vblendpd $0x3, %ymm15, %ymm13, %ymm11
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC05(%rip), %ymm14 // beta=1.0
+#else
+ vmovapd LC05(%rip), %ymm14 // beta=1.0
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 0(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 32(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 64(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 96(%r10, %r11, 2), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_12x4_lib4, .-inner_blend_scale_11_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_11_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_11_4x12_lib4, @function
+inner_tran_scale_11_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_11_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_11_4x12_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_11_4x12_lib4:
+#endif
+#endif
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm1, %ymm0, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm3, %ymm2, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm5, %ymm4, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm7, %ymm6, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm7
+
+ vunpcklpd %ymm9, %ymm8, %ymm12
+ vunpckhpd %ymm9, %ymm8, %ymm13
+ vunpcklpd %ymm11, %ymm10, %ymm14
+ vunpckhpd %ymm11, %ymm10, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm8
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm10
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm9
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm11
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC05(%rip), %ymm14 // beta=1.0
+#else
+ vmovapd LC05(%rip), %ymm14 // beta=1.0
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 128(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 256(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 288(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 320(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 352(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_11_4x12_lib4, .-inner_tran_scale_11_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
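+// reference sketch (plain C, not part of this file; names are illustrative):
+// the factorization performed below, writing the 12x4 block held in
+// ymm0..ymm11 as a plain column-major array D and ignoring the early exits
+// that skip work when kn < 4:
+//
+//	#include <math.h>
+//
+//	static void edge_dpotrf_12x4_vs_ref(double *D, double *inv_diag_E, int kn)
+//		{
+//		for(int k=0; k<4 && k<kn; k++)
+//			{
+//			double dkk = D[k+12*k];
+//			double inv = dkk>0.0 ? 1.0/sqrt(dkk) : 0.0; // zero a non-positive pivot
+//			inv_diag_E[k] = inv;
+//			for(int i=0; i<12; i++)
+//				D[i+12*k] *= inv; // scale column k of the whole panel
+//			for(int j=k+1; j<4; j++) // rank-1 update of the trailing columns
+//				for(int i=0; i<12; i++)
+//					D[i+12*j] -= D[j+12*k]*D[i+12*k];
+//			}
+//		}
+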
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_12x4_vs_lib4, @function
+inner_edge_dpotrf_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_12x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC05(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC05(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vmulpd %ymm8, %ymm13, %ymm8
+ cmpl $2, %r11d
+ jl 0f // ret
+// vperm2f128 $0x00, %ymm0, %ymm0, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vmulpd %ymm9, %ymm13, %ymm9
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vmulpd %ymm10, %ymm13, %ymm10
+ cmpl $4, %r11d
+ jl 0f // ret
+// vperm2f128 $0x11, %ymm2, %ymm2, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+// vextractf128 $0x1, %ymm3, %xmm13
+// vpermilpd $0x3, %xmm13, %xmm13
+ vpermpd $0xff, %ymm3, %ymm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vmulpd %ymm11, %ymm13, %ymm11
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_12x4_vs_lib4, .-inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
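+// reference sketch (plain C, not part of this file; names are illustrative):
+// the substitution performed below, with the 12x4 block D and the 4x4
+// lower-triangular factor E written as plain column-major arrays and
+// inv_diag_E holding the reciprocals of the diagonal of E:
+//
+//	// solve X * E^T = D in place (right, lower, transposed, non-unit diagonal)
+//	static void edge_dtrsm_rlt_inv_12x4_ref(const double *E,
+//			const double *inv_diag_E, double *D)
+//		{
+//		for(int j=0; j<4; j++)
+//			{
+//			for(int i=0; i<12; i++)
+//				D[i+12*j] *= inv_diag_E[j]; // divide column j by e_jj
+//			for(int k=j+1; k<4; k++) // eliminate column j from the later columns
+//				for(int i=0; i<12; i++)
+//					D[i+12*k] -= E[k+4*j] * D[i+12*j];
+//			}
+//		}
+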
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_12x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_12x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vmulpd %ymm8, %ymm13, %ymm8
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vmulpd %ymm9, %ymm13, %ymm9
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vmulpd %ymm10, %ymm13, %ymm10
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vmulpd %ymm11, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_12x4_lib4, .-inner_edge_dtrsm_rlt_inv_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
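+// reference sketch (plain C, not part of this file; names are illustrative):
+// the same substitution as in the 12x4 case above, now with a 4x12 block D
+// and a 12x12 lower-triangular factor E (the factor and its reciprocal
+// diagonal are called D / inv_diag_D in the register notes above); both are
+// written as plain column-major arrays here, while the code reads the factor
+// from three 4-row panels using the stride in r11:
+//
+//	static void edge_dtrsm_rlt_inv_4x12_ref(const double *E,
+//			const double *inv_diag_E, double *D)
+//		{
+//		for(int j=0; j<12; j++)
+//			{
+//			for(int i=0; i<4; i++)
+//				D[i+4*j] *= inv_diag_E[j];
+//			for(int k=j+1; k<12; k++)
+//				for(int i=0; i<4; i++)
+//					D[i+4*k] -= E[k+12*j] * D[i+4*j];
+//			}
+//		}
+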
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x12_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x12_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x12_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm4
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm7
+ vbroadcastsd 0(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm11
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm4
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm5
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm7
+ vbroadcastsd 32(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm11
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm4
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm5
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm6
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm7
+ vbroadcastsd 64(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm11
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm4
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm5
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm6
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm7
+ vbroadcastsd 96(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm11
+
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vbroadcastsd 0(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm11
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vbroadcastsd 32(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm11
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vbroadcastsd 64(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm11
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm7, %ymm13, %ymm7
+ vbroadcastsd 96(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm11
+
+ addq $128, %r10
+
+ vbroadcastsd 64(%r12), %ymm13
+ vmulpd %ymm8, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 72(%r12), %ymm13
+ vmulpd %ymm9, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 80(%r12), %ymm13
+ vmulpd %ymm10, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 88(%r12), %ymm13
+ vmulpd %ymm11, %ymm13, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x12_lib4, .-inner_edge_dtrsm_rlt_inv_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_12x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_12x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vmulpd %ymm8, %ymm13, %ymm8
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vmulpd %ymm9, %ymm13, %ymm9
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vmulpd %ymm10, %ymm13, %ymm10
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vmulpd %ymm11, %ymm13, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_12x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// r13d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// r13d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X12_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x12_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x12_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x12_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x12_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm4
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm7
+ vbroadcastsd 0(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm11
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm4
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm5
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm7
+ vbroadcastsd 32(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm11
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm4
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm5
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm6
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm7
+ vbroadcastsd 64(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm11
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm4
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm5
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm6
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm7
+ vbroadcastsd 96(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm11
+
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vbroadcastsd 0(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm11
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vbroadcastsd 32(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm11
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vbroadcastsd 64(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm11
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm7, %ymm13, %ymm7
+ vbroadcastsd 96(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm11
+
+ addq $128, %r10
+
+ vbroadcastsd 64(%r12), %ymm13
+ vmulpd %ymm8, %ymm13, %ymm8
+ cmpl $10, %r13d
+ jl 0f // ret
+ vbroadcastsd 8(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 72(%r12), %ymm13
+ vmulpd %ymm9, %ymm13, %ymm9
+ cmpl $11, %r13d
+ jl 0f // ret
+ vbroadcastsd 48(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 80(%r12), %ymm13
+ vmulpd %ymm10, %ymm13, %ymm10
+ cmpl $12, %r13d
+ jl 0f // ret
+ vbroadcastsd 88(%r10, %r11, 2), %ymm13
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 88(%r12), %ymm13
+ vmulpd %ymm11, %ymm13, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x12_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
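+// reference sketch (plain C, not part of this file; names are illustrative):
+// identical to the rlt_inv substitution except that E has an implicit unit
+// diagonal, so no scaling by reciprocal diagonal entries is needed:
+//
+//	static void edge_dtrsm_rlt_one_12x4_ref(const double *E, double *D)
+//		{
+//		for(int j=0; j<4; j++)
+//			for(int k=j+1; k<4; k++)
+//				for(int i=0; i<12; i++)
+//					D[i+12*k] -= E[k+4*j] * D[i+12*j];
+//		}
+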
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_12x4_lib4, @function
+inner_edge_dtrsm_rlt_one_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_12x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_12x4_lib4, .-inner_edge_dtrsm_rlt_one_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_12x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_12x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+ jl 0f // ret
+
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+
+ cmpl $3, %r11d
+ jl 0f // ret
+
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+
+ cmpl $4, %r11d
+ jl 0f // ret
+
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_12x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
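+// reference sketch (plain C, not part of this file; names are illustrative):
+// the substitution performed below, with the 4x4 upper-triangular factor E
+// and the 12x4 block D written as plain column-major arrays:
+//
+//	// solve X * E^T = D in place (right, upper, transposed, non-unit diagonal):
+//	// backward sweep over the columns of D
+//	static void edge_dtrsm_rut_inv_12x4_ref(const double *E,
+//			const double *inv_diag_E, double *D)
+//		{
+//		for(int j=3; j>=0; j--)
+//			{
+//			for(int i=0; i<12; i++)
+//				D[i+12*j] *= inv_diag_E[j];
+//			for(int k=0; k<j; k++)
+//				for(int i=0; i<12; i++)
+//					D[i+12*k] -= E[k+4*j] * D[i+12*j];
+//			}
+//		}
+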
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_12x4_lib4, @function
+inner_edge_dtrsm_rut_inv_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_12x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vmulpd %ymm11, %ymm12, %ymm11
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm2
+ vfnmadd231pd %ymm7, %ymm12, %ymm6
+ vfnmadd231pd %ymm11, %ymm12, %ymm10
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm1
+ vfnmadd231pd %ymm7, %ymm12, %ymm5
+ vfnmadd231pd %ymm11, %ymm12, %ymm9
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm0
+ vfnmadd231pd %ymm7, %ymm12, %ymm4
+ vfnmadd231pd %ymm11, %ymm12, %ymm8
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vmulpd %ymm10, %ymm12, %ymm10
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm1
+ vfnmadd231pd %ymm6, %ymm12, %ymm5
+ vfnmadd231pd %ymm10, %ymm12, %ymm9
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm0
+ vfnmadd231pd %ymm6, %ymm12, %ymm4
+ vfnmadd231pd %ymm10, %ymm12, %ymm8
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vmulpd %ymm9, %ymm12, %ymm9
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm0
+ vfnmadd231pd %ymm5, %ymm12, %ymm4
+ vfnmadd231pd %ymm9, %ymm12, %ymm8
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+ vmulpd %ymm8, %ymm12, %ymm8
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_12x4_lib4, .-inner_edge_dtrsm_rut_inv_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
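+// reference sketch (plain C, not part of this file; names are illustrative):
+// the substitution performed below, with the 4x4 upper-triangular factor E
+// written as a plain column-major array:
+//
+//	// solve X * E = D in place (right, upper, not transposed, non-unit diagonal):
+//	// forward sweep over the columns of D
+//	static void edge_dtrsm_run_inv_12x4_ref(const double *E,
+//			const double *inv_diag_E, double *D)
+//		{
+//		for(int j=0; j<4; j++)
+//			{
+//			for(int k=0; k<j; k++)
+//				for(int i=0; i<12; i++)
+//					D[i+12*j] -= E[k+4*j] * D[i+12*k];
+//			for(int i=0; i<12; i++)
+//				D[i+12*j] *= inv_diag_E[j];
+//			}
+//		}
+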
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUN_INV_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_run_inv_12x4_lib4, @function
+inner_edge_dtrsm_run_inv_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_run_inv_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_12x4_lib4:
+#endif
+#endif
+
+ // first column
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+ vmulpd %ymm8, %ymm12, %ymm8
+
+ // second column
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm1
+ vfnmadd231pd %ymm4, %ymm12, %ymm5
+ vfnmadd231pd %ymm8, %ymm12, %ymm9
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vmulpd %ymm9, %ymm12, %ymm9
+
+ // third column
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm2
+ vfnmadd231pd %ymm4, %ymm12, %ymm6
+ vfnmadd231pd %ymm8, %ymm12, %ymm10
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm2
+ vfnmadd231pd %ymm5, %ymm12, %ymm6
+ vfnmadd231pd %ymm9, %ymm12, %ymm10
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vmulpd %ymm10, %ymm12, %ymm10
+
+ // fourth column
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm3
+ vfnmadd231pd %ymm4, %ymm12, %ymm7
+ vfnmadd231pd %ymm8, %ymm12, %ymm11
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm3
+ vfnmadd231pd %ymm5, %ymm12, %ymm7
+ vfnmadd231pd %ymm9, %ymm12, %ymm11
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm3
+ vfnmadd231pd %ymm6, %ymm12, %ymm7
+ vfnmadd231pd %ymm10, %ymm12, %ymm11
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vmulpd %ymm11, %ymm12, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_run_inv_12x4_lib4, .-inner_edge_dtrsm_run_inv_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_12x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_12x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vmulpd %ymm11, %ymm12, %ymm11
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm2
+ vfnmadd231pd %ymm7, %ymm12, %ymm6
+ vfnmadd231pd %ymm11, %ymm12, %ymm10
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm1
+ vfnmadd231pd %ymm7, %ymm12, %ymm5
+ vfnmadd231pd %ymm11, %ymm12, %ymm9
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm0
+ vfnmadd231pd %ymm7, %ymm12, %ymm4
+ vfnmadd231pd %ymm11, %ymm12, %ymm8
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vmulpd %ymm10, %ymm12, %ymm10
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm1
+ vfnmadd231pd %ymm6, %ymm12, %ymm5
+ vfnmadd231pd %ymm10, %ymm12, %ymm9
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm0
+ vfnmadd231pd %ymm6, %ymm12, %ymm4
+ vfnmadd231pd %ymm10, %ymm12, %ymm8
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vmulpd %ymm9, %ymm12, %ymm9
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm0
+ vfnmadd231pd %ymm5, %ymm12, %ymm4
+ vfnmadd231pd %ymm9, %ymm12, %ymm8
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+ vmulpd %ymm8, %ymm12, %ymm8
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_12x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10 <- E0
+// r11 <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E0
+// r11 <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
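+// reference sketch (plain C, not part of this file; names are illustrative):
+// the substitution performed below, with the 12x12 unit-lower-triangular
+// factor E written as a plain column-major array (the code reads it from
+// three 4-row panels spaced by 4*sde doubles) and D the 12x4 right-hand side:
+//
+//	// solve E * X = D in place (left, lower, not transposed, unit diagonal):
+//	// forward substitution, four right-hand sides at a time
+//	static void edge_dtrsm_lln_one_12x4_ref(const double *E, double *D)
+//		{
+//		for(int k=0; k<12; k++)
+//			for(int i=k+1; i<12; i++)
+//				for(int j=0; j<4; j++)
+//					D[i+12*j] -= E[i+12*k] * D[k+12*j];
+//		}
+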
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LLN_ONE_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lln_one_12x4_lib4, @function
+inner_edge_dtrsm_lln_one_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lln_one_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_12x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r12 // E1 <- E0
+ addq %r11, %r12 // E1 <- E0 + 4*sde*sizeof(double)
+ movq %r12, %r13 // E2 <- E1
+ addq %r11, %r13 // E2 <- E1 + 4*sde*sizeof(double)
+
+ // left block-column
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 0(%r10), %ymm12
+ vblendpd $0x1, %ymm15, %ymm12, %ymm12
+ vmovapd 0(%r12), %ymm14
+ vmovapd 0(%r13), %ymm15
+ vpermpd $0x00, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 32(%r10), %ymm12
+ vblendpd $0x3, %ymm15, %ymm12, %ymm12
+ vmovapd 32(%r12), %ymm14
+ vmovapd 32(%r13), %ymm15
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0x55, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 64(%r10), %ymm12
+ vblendpd $0x7, %ymm15, %ymm12, %ymm12
+ vmovapd 64(%r12), %ymm14
+ vmovapd 64(%r13), %ymm15
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0xaa, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0xaa, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vmovapd 96(%r12), %ymm14
+ vmovapd 96(%r13), %ymm15
+ vpermpd $0xff, %ymm0, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0xff, %ymm1, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0xff, %ymm3, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ addq $128, %r12
+ addq $128, %r13
+
+
+ // middle block-column
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 0(%r12), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 0(%r13), %ymm14
+ vpermpd $0x00, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0x00, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0x00, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0x00, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 32(%r12), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm14
+ vpermpd $0x55, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0x55, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0x55, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0x55, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 64(%r12), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 64(%r13), %ymm14
+ vpermpd $0xaa, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0xaa, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0xaa, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0xaa, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+ vmovapd 96(%r13), %ymm14
+ vpermpd $0xff, %ymm4, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0xff, %ymm5, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0xff, %ymm6, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0xff, %ymm7, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+
+ addq $128, %r13
+
+
+ // right block-column
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r13), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vpermpd $0x00, %ymm8, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vpermpd $0x00, %ymm9, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vpermpd $0x00, %ymm10, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vpermpd $0x00, %ymm11, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+ vmovapd 32(%r13), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vpermpd $0x55, %ymm8, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vpermpd $0x55, %ymm9, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vpermpd $0x55, %ymm10, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vpermpd $0x55, %ymm11, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+ vmovapd 64(%r13), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vpermpd $0xaa, %ymm8, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vpermpd $0xaa, %ymm9, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vpermpd $0xaa, %ymm10, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vpermpd $0xaa, %ymm11, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lln_one_12x4_lib4, .-inner_edge_dtrsm_lln_one_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
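+// reference sketch (plain C, not part of this file; names are illustrative):
+// the substitution performed below, with the 12x12 upper-triangular factor E
+// written as a plain column-major array (the code reads it from three 4-row
+// panels spaced by 4*sde doubles) and D the 12x4 right-hand side:
+//
+//	// solve E * X = D in place (left, upper, not transposed, non-unit diagonal):
+//	// backward substitution, four right-hand sides at a time
+//	static void edge_dtrsm_lun_inv_12x4_ref(const double *E,
+//			const double *inv_diag_E, double *D)
+//		{
+//		for(int k=11; k>=0; k--)
+//			for(int j=0; j<4; j++)
+//				{
+//				D[k+12*j] *= inv_diag_E[k];
+//				for(int i=0; i<k; i++)
+//					D[i+12*j] -= E[i+12*k] * D[k+12*j];
+//				}
+//		}
+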
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_12x4_lib4, @function
+inner_edge_dtrsm_lun_inv_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_12x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r13 // E1 <- E0
+ addq %r11, %r13 // E1 <- E0 + 4*sde*sizeof(double)
+ movq %r13, %r14 // E2 <- E1
+ addq %r11, %r14 // E2 <- E1 + 4*sde*sizeof(double)
+
+ // bottom-right
+
+ vmovapd 352(%r14), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 88(%r12), %ymm12
+ vmovapd 352(%r13), %ymm15
+// vmovapd 352(%r10), %ymm11
+
+ vpermpd $0xff, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm13, %ymm14, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 352(%r10), %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm13, %ymm14, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 352(%r10), %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm13, %ymm14, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 352(%r10), %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm13, %ymm14, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 352(%r10), %ymm14, %ymm3
+
+
+	vxorpd	%ymm13, %ymm13, %ymm13 // 0.0 (note: likely redundant, the VEX-encoded 128-bit load below already zeroes the upper lanes of ymm13)
+ vmovapd 320(%r14), %xmm13
+ vbroadcastsd 80(%r12), %ymm12
+ vmovapd 320(%r13), %ymm15
+// vmovapd 320(%r10), %ymm11
+
+ vpermpd $0xaa, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm13, %ymm14, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 320(%r10), %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm13, %ymm14, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 320(%r10), %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm13, %ymm14, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 320(%r10), %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm13, %ymm14, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 320(%r10), %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 288(%r14), %xmm13
+ vbroadcastsd 72(%r12), %ymm12
+ vmovapd 288(%r13), %ymm15
+// vmovapd 288(%r10), %ymm11
+
+ vpermpd $0x55, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm13, %ymm14, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 288(%r10), %ymm14, %ymm0
+
+ vpermpd $0x55, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm13, %ymm14, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 288(%r10), %ymm14, %ymm1
+
+ vpermpd $0x55, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm13, %ymm14, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 288(%r10), %ymm14, %ymm2
+
+ vpermpd $0x55, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm13, %ymm14, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 288(%r10), %ymm14, %ymm3
+
+
+ vbroadcastsd 64(%r12), %ymm12
+ vmovapd 256(%r13), %ymm15
+// vmovapd 256(%r10), %ymm11
+
+ vpermpd $0x00, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 256(%r10), %ymm14, %ymm0
+
+ vpermpd $0x00, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 256(%r10), %ymm14, %ymm1
+
+ vpermpd $0x00, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 256(%r10), %ymm14, %ymm2
+
+ vpermpd $0x00, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 256(%r10), %ymm14, %ymm3
+
+
+ // middle-middle
+
+ vmovapd 224(%r13), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 56(%r12), %ymm12
+ vmovapd 224(%r10), %ymm15
+
+ vpermpd $0xff, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+	vxorpd	%ymm13, %ymm13, %ymm13 // 0.0 (likely redundant, as above)
+ vmovapd 192(%r13), %xmm13
+ vbroadcastsd 48(%r12), %ymm12
+ vmovapd 192(%r10), %ymm15
+
+ vpermpd $0xaa, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 160(%r13), %xmm13
+ vbroadcastsd 40(%r12), %ymm12
+ vmovapd 160(%r10), %ymm15
+
+ vpermpd $0x55, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0x55, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0x55, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0x55, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 128(%r10), %ymm15
+
+ vpermpd $0x00, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0x00, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0x00, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0x00, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+ // top-left
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r12), %ymm12
+
+ vpermpd $0xff, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r12), %ymm12
+
+ vpermpd $0xaa, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r12), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vbroadcastsd 0(%r12), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_12x4_lib4, .-inner_edge_dtrsm_lun_inv_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
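+//
+// The "_vs" (variable size) variant differs from the fixed-size edge above
+// only in the bottom-right 4x4 triangle: rows with index >= km are not part
+// of the output, so their substitution steps are skipped. Sketch of the
+// added control flow (assumption: km is expected in the range 9..12):
+//
+//	if(km>11) { /* back-substitute row 11 */ }
+//	if(km>10) { /* back-substitute row 10 */ }
+//	if(km>9)  { /* back-substitute row 9  */ }
+//	/* rows 0-8 are always processed */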
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_12x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_12x4_vs_lib4:
+#endif
+#endif
+
+ movq %r10, %r15 // E1 <- E0
+ addq %r11, %r15 // E1 <- E0 + 4*sde*sizeof(double)
+ movq %r15, %r14 // E2 <- E1
+ addq %r11, %r14 // E2 <- E1 + 4*sde*sizeof(double)
+
+ // bottom-right
+
+ cmpl $11, %r13d
+ jle 0f
+
+ vmovapd 352(%r14), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 88(%r12), %ymm12
+ vmovapd 352(%r15), %ymm15
+// vmovapd 352(%r10), %ymm11
+
+ vpermpd $0xff, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm13, %ymm14, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 352(%r10), %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm13, %ymm14, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 352(%r10), %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm13, %ymm14, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 352(%r10), %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm13, %ymm14, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 352(%r10), %ymm14, %ymm3
+
+0:
+ cmpl $10, %r13d
+ jle 1f
+
+	vxorpd	%ymm13, %ymm13, %ymm13 // 0.0 (likely redundant, the VEX-encoded 128-bit load below zeroes the upper lanes)
+ vmovapd 320(%r14), %xmm13
+ vbroadcastsd 80(%r12), %ymm12
+ vmovapd 320(%r15), %ymm15
+// vmovapd 320(%r10), %ymm11
+
+ vpermpd $0xaa, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm13, %ymm14, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 320(%r10), %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm13, %ymm14, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 320(%r10), %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm13, %ymm14, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 320(%r10), %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm13, %ymm14, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 320(%r10), %ymm14, %ymm3
+
+1:
+ cmpl $9, %r13d
+ jle 2f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 288(%r14), %xmm13
+ vbroadcastsd 72(%r12), %ymm12
+ vmovapd 288(%r15), %ymm15
+// vmovapd 288(%r10), %ymm11
+
+ vpermpd $0x55, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm13, %ymm14, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 288(%r10), %ymm14, %ymm0
+
+ vpermpd $0x55, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm13, %ymm14, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 288(%r10), %ymm14, %ymm1
+
+ vpermpd $0x55, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm13, %ymm14, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 288(%r10), %ymm14, %ymm2
+
+ vpermpd $0x55, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm13, %ymm14, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 288(%r10), %ymm14, %ymm3
+
+2:
+
+ vbroadcastsd 64(%r12), %ymm12
+ vmovapd 256(%r15), %ymm15
+// vmovapd 256(%r10), %ymm11
+
+ vpermpd $0x00, %ymm8, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm8, %ymm8
+ vfnmadd231pd %ymm15, %ymm14, %ymm4
+ vfnmadd231pd 256(%r10), %ymm14, %ymm0
+
+ vpermpd $0x00, %ymm9, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm9, %ymm9
+ vfnmadd231pd %ymm15, %ymm14, %ymm5
+ vfnmadd231pd 256(%r10), %ymm14, %ymm1
+
+ vpermpd $0x00, %ymm10, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm10, %ymm10
+ vfnmadd231pd %ymm15, %ymm14, %ymm6
+ vfnmadd231pd 256(%r10), %ymm14, %ymm2
+
+ vpermpd $0x00, %ymm11, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm11, %ymm11
+ vfnmadd231pd %ymm15, %ymm14, %ymm7
+ vfnmadd231pd 256(%r10), %ymm14, %ymm3
+
+
+ // middle-middle
+
+ vmovapd 224(%r15), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 56(%r12), %ymm12
+ vmovapd 224(%r10), %ymm15
+
+ vpermpd $0xff, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+	vxorpd	%ymm13, %ymm13, %ymm13 // 0.0 (likely redundant, as above)
+ vmovapd 192(%r15), %xmm13
+ vbroadcastsd 48(%r12), %ymm12
+ vmovapd 192(%r10), %ymm15
+
+ vpermpd $0xaa, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 160(%r15), %xmm13
+ vbroadcastsd 40(%r12), %ymm12
+ vmovapd 160(%r10), %ymm15
+
+ vpermpd $0x55, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0x55, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0x55, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0x55, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 128(%r10), %ymm15
+
+ vpermpd $0x00, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm15, %ymm14, %ymm0
+
+ vpermpd $0x00, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm15, %ymm14, %ymm1
+
+ vpermpd $0x00, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm15, %ymm14, %ymm2
+
+ vpermpd $0x00, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm15, %ymm14, %ymm3
+
+
+ // top-left
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r12), %ymm12
+
+ vpermpd $0xff, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r12), %ymm12
+
+ vpermpd $0xaa, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r12), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vbroadcastsd 0(%r12), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_12x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// left kernel
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
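+//
+// Rough sketch of the factorization performed below (illustrative only, with
+// A standing for the 12x4 accumulator in ymm0-ymm11 and plain row/column
+// indexing): unblocked LU without pivoting of the 12x4 panel, U in the top
+// 4x4, scaled L below, reciprocals of the pivots stored in inv_diag_E:
+//
+//	for(j=0; j<4; j++)
+//		{
+//		for(k=0; k<j; k++)              // update column j with columns 0..j-1
+//			for(i=k+1; i<12; i++)
+//				A[i][j] -= A[i][k] * A[k][j];
+//		inv_diag_E[j] = 1.0 / A[j][j];
+//		for(i=j+1; i<12; i++)           // scale the subdiagonal entries
+//			A[i][j] *= inv_diag_E[j];
+//		}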
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_L_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_l_12x4_lib4, @function
+inner_edge_dgetrf_l_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_l_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_l_12x4_lib4:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC05(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC05(%rip), %xmm14 // 1.0
+#endif
+// vmovddup %xmm14, %xmm14
+
+ // first column
+// vblendpd $0x1, %ymm0, %ymm12, %ymm12
+ vmovapd %ymm0, %ymm12
+ vdivsd %xmm0, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r10)
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vmulpd %ymm8, %ymm13, %ymm8
+ vblendpd $0x1, %ymm12, %ymm0, %ymm0
+
+ // second column
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vblendpd $0x2, %ymm1, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r10)
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vmulpd %ymm9, %ymm13, %ymm9
+ vblendpd $0x3, %ymm12, %ymm1, %ymm1
+
+ // third column
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vblendpd $0x2, %ymm2, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vblendpd $0x4, %ymm2, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm2, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r10)
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vmulpd %ymm10, %ymm13, %ymm10
+ vblendpd $0x7, %ymm12, %ymm2, %ymm2
+
+ // fourth column
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+ vblendpd $0x2, %ymm3, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+ vblendpd $0x4, %ymm3, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+ vblendpd $0x8, %ymm3, %ymm12, %ymm12
+
+ vpermpd $0xff, %ymm3, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r10)
+// vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vmulpd %ymm11, %ymm13, %ymm11
+ vblendpd $0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_l_12x4_lib4, .-inner_edge_dgetrf_l_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// middle kernel
+//
+// input arguments:
+// r10 <- E
+// r11 <- sde
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- sde
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
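+//
+// Rough sketch (illustrative only): C denotes the 12x4 accumulator and E the
+// previously factorized block-column to its left. The edge first applies the
+// unit lower triangular columns 0-3 of E to C, then LU-factorizes rows 4-11
+// of C with the pivots on rows 4-7 (reciprocals go to inv_diag_D):
+//
+//	for(k=0; k<4; k++)                  // forward substitution, unit diagonal
+//		for(i=k+1; i<12; i++)
+//			for(j=0; j<4; j++)
+//				C[i][j] -= E[i][k] * C[k][j];
+//	// ... then factorize C(4:11,0:3) as in the left edge above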
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_M_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_m_12x4_lib4, @function
+inner_edge_dgetrf_m_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_m_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_m_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_m_12x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r14 // E1 <- E0
+ addq %r11, %r14 // E1 <- E0 + 4*sde*sizeof(double)
+ movq %r14, %r13 // E2 <- E1
+ addq %r11, %r13 // E2 <- E1 + 4*sde*sizeof(double)
+
+ // solve upper 4x4 & correct lower 8x4
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 0(%r10), %ymm12
+ vblendpd $0x1, %ymm15, %ymm12, %ymm12
+ vmovapd 0(%r14), %ymm14
+ vmovapd 0(%r13), %ymm15
+ vpermpd $0x00, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 32(%r10), %ymm12
+ vblendpd $0x3, %ymm15, %ymm12, %ymm12
+ vmovapd 32(%r14), %ymm14
+ vmovapd 32(%r13), %ymm15
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0x55, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 64(%r10), %ymm12
+ vblendpd $0x7, %ymm15, %ymm12, %ymm12
+ vmovapd 64(%r14), %ymm14
+ vmovapd 64(%r13), %ymm15
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0xaa, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0xaa, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vmovapd 96(%r14), %ymm14
+ vmovapd 96(%r13), %ymm15
+ vpermpd $0xff, %ymm0, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0xff, %ymm1, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0xff, %ymm3, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+
+ // factorize lower 8x4
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC05(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC05(%rip), %xmm14 // 1.0
+#endif
+// vmovddup %xmm14, %xmm14
+
+ // first column
+// vblendpd $0x1, %ymm4, %ymm12, %ymm12
+ vmovapd %ymm4, %ymm12
+ vdivsd %xmm4, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r12)
+ vmulpd %ymm4, %ymm13, %ymm4
+ vmulpd %ymm8, %ymm13, %ymm8
+ vblendpd $0x1, %ymm12, %ymm4, %ymm4
+
+ // second column
+ vpermpd $0x00, %ymm5, %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vblendpd $0x2, %ymm5, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm5, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r12)
+ vmulpd %ymm5, %ymm13, %ymm5
+ vmulpd %ymm9, %ymm13, %ymm9
+ vblendpd $0x3, %ymm12, %ymm5, %ymm5
+
+ // third column
+ vpermpd $0x00, %ymm6, %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vblendpd $0x2, %ymm6, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm6, %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vblendpd $0x4, %ymm6, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm6, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r12)
+ vmulpd %ymm6, %ymm13, %ymm6
+ vmulpd %ymm10, %ymm13, %ymm10
+ vblendpd $0x7, %ymm12, %ymm6, %ymm6
+
+ // fourth column
+ vpermpd $0x00, %ymm7, %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+ vblendpd $0x2, %ymm7, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm7, %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+ vblendpd $0x4, %ymm7, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm7, %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+ vblendpd $0x8, %ymm7, %ymm12, %ymm12
+
+ vpermpd $0xff, %ymm7, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r12)
+// vmulpd %ymm7, %ymm13, %ymm7
+ vmulpd %ymm11, %ymm13, %ymm11
+ vblendpd $0x7, %ymm12, %ymm7, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_m_12x4_lib4, .-inner_edge_dgetrf_m_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// right kernel
+//
+// input arguments:
+// r10 <- E
+// r11 <- sde
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- sde
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
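+//
+// Rough sketch (illustrative only): same structure as the middle edge, but
+// with two already-factorized block-columns. The unit lower triangular
+// columns 0-7 of E are applied to the 12x4 accumulator C, then the bottom
+// 4x4 block C(8:11,0:3) is LU-factorized without pivoting:
+//
+//	for(k=0; k<8; k++)                  // forward substitution, unit diagonal
+//		for(i=k+1; i<12; i++)
+//			for(j=0; j<4; j++)
+//				C[i][j] -= E[i][k] * C[k][j];
+//	// ... then factorize C(8:11,0:3); 1.0/pivot goes to inv_diag_D[j]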
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_R_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_r_12x4_lib4, @function
+inner_edge_dgetrf_r_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_r_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_r_12x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_r_12x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r14 // E1 <- E0
+ addq %r11, %r14 // E1 <- E0 + 4*sde*sizeof(double)
+ movq %r14, %r13 // E2 <- E1
+ addq %r11, %r13 // E2 <- E1 + 4*sde*sizeof(double)
+
+ // solve upper 8x4 & correct lower 4x4
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 0(%r10), %ymm12
+ vblendpd $0x1, %ymm15, %ymm12, %ymm12
+ vmovapd 0(%r14), %ymm14
+ vmovapd 0(%r13), %ymm15
+ vpermpd $0x00, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 32(%r10), %ymm12
+ vblendpd $0x3, %ymm15, %ymm12, %ymm12
+ vmovapd 32(%r14), %ymm14
+ vmovapd 32(%r13), %ymm15
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0x55, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vmovapd 64(%r10), %ymm12
+ vblendpd $0x7, %ymm15, %ymm12, %ymm12
+ vmovapd 64(%r14), %ymm14
+ vmovapd 64(%r13), %ymm15
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0xaa, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0xaa, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ vmovapd 96(%r14), %ymm14
+ vmovapd 96(%r13), %ymm15
+ vpermpd $0xff, %ymm0, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vfnmadd231pd %ymm15, %ymm13, %ymm8
+ vpermpd $0xff, %ymm1, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vfnmadd231pd %ymm15, %ymm13, %ymm9
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vfnmadd231pd %ymm15, %ymm13, %ymm10
+ vpermpd $0xff, %ymm3, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+ vfnmadd231pd %ymm15, %ymm13, %ymm11
+
+ addq $128, %r14
+ addq $128, %r13
+
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 0(%r14), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 0(%r13), %ymm14
+ vpermpd $0x00, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0x00, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0x00, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0x00, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 32(%r14), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm14
+ vpermpd $0x55, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0x55, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0x55, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0x55, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 64(%r14), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 64(%r13), %ymm14
+ vpermpd $0xaa, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0xaa, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0xaa, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0xaa, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+ vmovapd 96(%r13), %ymm14
+ vpermpd $0xff, %ymm4, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm8
+ vpermpd $0xff, %ymm5, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm9
+ vpermpd $0xff, %ymm6, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm10
+ vpermpd $0xff, %ymm7, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm11
+
+
+
+	// factorize lower 4x4
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC05(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC05(%rip), %xmm14 // 1.0
+#endif
+// vmovddup %xmm14, %xmm14
+
+ // first column
+// vblendpd $0x1, %ymm8, %ymm12, %ymm12
+ vmovapd %ymm8, %ymm12
+ vdivsd %xmm8, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r12)
+ vmulpd %ymm8, %ymm13, %ymm8
+ vblendpd $0x1, %ymm12, %ymm8, %ymm8
+
+ // second column
+ vpermpd $0x00, %ymm9, %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vblendpd $0x2, %ymm9, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm9, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r12)
+ vmulpd %ymm9, %ymm13, %ymm9
+ vblendpd $0x3, %ymm12, %ymm9, %ymm9
+
+ // third column
+ vpermpd $0x00, %ymm10, %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vblendpd $0x2, %ymm10, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm10, %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vblendpd $0x4, %ymm10, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm10, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r12)
+ vmulpd %ymm10, %ymm13, %ymm10
+ vblendpd $0x7, %ymm12, %ymm10, %ymm10
+
+ // fourth column
+ vpermpd $0x00, %ymm11, %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+ vblendpd $0x2, %ymm11, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm11, %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+ vblendpd $0x4, %ymm11, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm11, %ymm13
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+ vblendpd $0x8, %ymm11, %ymm12, %ymm12
+
+ vpermpd $0xff, %ymm11, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+// vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r12)
+// vmulpd %ymm11, %ymm13, %ymm11
+ vblendpd $0x7, %ymm12, %ymm11, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_r_12x4_lib4, .-inner_edge_dgetrf_r_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
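+//
+// Address sketch of the panel-major ("lib4") store below (documentation
+// only; sdd is the panel stride in doubles, r11 = 4*sdd*sizeof(double)):
+//
+//	for(j=0; j<4; j++)
+//		for(i=0; i<12; i++)
+//			D[(i/4)*4*sdd + j*4 + i%4] = acc[i][j];   // acc = ymm0..ymm11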
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_12x4_lib4, @function
+inner_store_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_12x4_lib4; .scl 2; .type 32; .endef
+inner_store_12x4_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+
+ vmovapd %ymm8, 0(%r10, %r11, 2)
+ vmovapd %ymm9, 32(%r10, %r11, 2)
+ vmovapd %ymm10, 64(%r10, %r11, 2)
+ vmovapd %ymm11, 96(%r10, %r11, 2)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_12x4_lib4, .-inner_store_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
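+//
+// Address sketch (documentation only): the 4x12 result fits in a single
+// 4-row panel of D, so the 12 accumulator registers ymm0-ymm11, each holding
+// one 4-element column, are stored back to back, 32 bytes apart:
+//
+//	for(j=0; j<12; j++)
+//		for(i=0; i<4; i++)
+//			D[j*4+i] = acc[i][j];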
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X12_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x12_lib4, @function
+inner_store_4x12_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x12_lib4; .scl 2; .type 32; .endef
+inner_store_4x12_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 128(%r10)
+ vmovapd %ymm5, 160(%r10)
+ vmovapd %ymm6, 192(%r10)
+ vmovapd %ymm7, 224(%r10)
+
+ vmovapd %ymm8, 256(%r10)
+ vmovapd %ymm9, 288(%r10)
+ vmovapd %ymm10, 320(%r10)
+ vmovapd %ymm11, 352(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x12_lib4, .-inner_store_4x12_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
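+//
+// Mask sketch (assumption: the constant .LC04 holds { 8.5, 9.5, 10.5, 11.5 }):
+// the third panel (rows 8-11) is stored with vmaskmovpd so that row i is
+// written only when i < km, while kn limits the number of stored columns;
+// the first two panels are always written in full:
+//
+//	for(j=0; j<kn; j++)
+//		for(i=0; i<12; i++)
+//			if(i<8 || i<km)
+//				D[(i/4)*4*sdd + j*4 + i%4] = acc[i][j];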
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_12x4_vs_lib4, @function
+inner_store_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_12x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC04(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmaskmovpd %ymm8, %ymm15, 0(%r10, %r11, 2)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmaskmovpd %ymm9, %ymm15, 32(%r10, %r11, 2)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmaskmovpd %ymm10, %ymm15, 64(%r10, %r11, 2)
+ je 0f // end
+ vmovapd %ymm3, 96(%r10)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+ vmaskmovpd %ymm11, %ymm15, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_12x4_vs_lib4, .-inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X12_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x12_vs_lib4, @function
+inner_store_4x12_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x12_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x12_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+ vmaskmovpd %ymm4, %ymm15, 128(%r10)
+ vmaskmovpd %ymm5, %ymm15, 160(%r10)
+ vmaskmovpd %ymm6, %ymm15, 192(%r10)
+ vmaskmovpd %ymm7, %ymm15, 224(%r10)
+
+ vmaskmovpd %ymm8, %ymm15, 256(%r10)
+ cmpl $10, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm9, %ymm15, 288(%r10)
+ cmpl $11, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm10, %ymm15, 320(%r10)
+ je 0f // end
+ vmaskmovpd %ymm11, %ymm15, 352(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x12_vs_lib4, .-inner_store_4x12_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
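+//
+// Sketch (documentation only): only the lower triangle of the top 4x4 block
+// is written; the strictly upper entries already present in D are kept by
+// loading the column and blending before the store. The two lower panels
+// are written in full:
+//
+//	for(j=0; j<4; j++)
+//		for(i=0; i<12; i++)
+//			if(i>=j)
+//				D[(i/4)*4*sdd + j*4 + i%4] = acc[i][j];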
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_lib4, @function
+inner_store_l_12x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_12x4_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+
+ vmovapd %ymm8, 0(%r10, %r11, 2)
+ vmovapd %ymm9, 32(%r10, %r11, 2)
+ vmovapd %ymm10, 64(%r10, %r11, 2)
+ vmovapd %ymm11, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_lib4, .-inner_store_l_12x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_vs_lib4, @function
+inner_store_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_12x4_vs_lib4:
+#endif
+#endif
+
+ movq %r10, %r15 // D1 <- D0
+ addq %r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+ movq %r15, %r14 // D2 <- D1
+ addq %r11, %r14 // D2 <- D1 + 4*sdd*sizeof(double)
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC04(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ cmpl $2, %r13d
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm4, 0(%r15)
+ vmaskmovpd %ymm8, %ymm15, 0(%r14)
+ jl 0f // end
+ cmpl $3, %r13d
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm5, 32(%r15)
+ vmaskmovpd %ymm9, %ymm15, 32(%r14)
+ jl 0f // end
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm6, 64(%r15)
+ vmaskmovpd %ymm10, %ymm15, 64(%r14)
+ je 0f // end
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+ vmovapd %ymm7, 96(%r15)
+ vmaskmovpd %ymm11, %ymm15, 96(%r14)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_vs_lib4, .-inner_store_l_12x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemm_nt_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
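+//
+// Functional sketch (illustrative assumption consistent with the inner
+// routines called below): D = alpha*A*B^T + beta*C on a 12x4 block, with
+// A, C, D stored panel-major (strides sda, sdc, sdd) and B in one panel;
+// plain math indexing is used here:
+//
+//	for(i=0; i<12; i++)
+//		for(j=0; j<4; j++)
+//			{
+//			tmp = 0.0;
+//			for(l=0; l<k; l++)
+//				tmp += A[i][l] * B[j][l];   // "nt": B accessed transposed
+//			D[i][j] = alpha[0]*tmp + beta[0]*C[i][j];
+//			}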
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_12x4_lib4
+ .type kernel_dgemm_nt_12x4_lib4, @function
+kernel_dgemm_nt_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_12x4_lib4
+_kernel_dgemm_nt_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_12x4_lib4
+ .def kernel_dgemm_nt_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_12x4_lib4, .-kernel_dgemm_nt_12x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dgemm_nt_4x12_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x12_lib4
+ .type kernel_dgemm_nt_4x12_lib4, @function
+kernel_dgemm_nt_4x12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x12_lib4
+_kernel_dgemm_nt_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x12_lib4
+ .def kernel_dgemm_nt_4x12_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG5, %r12 // sdb
+ sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
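+	// A and B are swapped with respect to the 12x4 kernel: the shared 12x4 nt
+	// inner kernel accumulates B*A' (the transpose of the desired 4x12 block),
+	// and the transposed scale/store routines below recover D = alpha*A*B' + beta*C.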
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x12_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x12_lib4, .-kernel_dgemm_nt_4x12_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_nt_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_12x4_vs_lib4
+ .type kernel_dgemm_nt_12x4_vs_lib4, @function
+kernel_dgemm_nt_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_12x4_vs_lib4
+_kernel_dgemm_nt_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_12x4_vs_lib4
+ .def kernel_dgemm_nt_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
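+	// km and kn clip the store to the leftover block: the _vs ("variable size")
+	// store writes only km rows and kn columns of the 12x4 result, using
+	// masked/partial stores, so the kernel can be used on the right and bottom
+	// edges of the output matrix.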
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_12x4_vs_lib4, .-kernel_dgemm_nt_12x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemm_nt_4x12_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x12_vs_lib4
+ .type kernel_dgemm_nt_4x12_vs_lib4, @function
+kernel_dgemm_nt_4x12_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x12_vs_lib4
+_kernel_dgemm_nt_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x12_vs_lib4
+ .def kernel_dgemm_nt_4x12_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x12_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG5, %r12 // sdb
+ sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x12_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x12_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x12_vs_lib4, .-kernel_dgemm_nt_4x12_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_nn_12x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_12x4_lib4
+ .type kernel_dgemm_nn_12x4_lib4, @function
+kernel_dgemm_nn_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_12x4_lib4
+_kernel_dgemm_nn_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_12x4_lib4
+ .def kernel_dgemm_nn_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_12x4_lib4, .-kernel_dgemm_nn_12x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nn_4x12_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x12_lib4
+ .type kernel_dgemm_nn_4x12_lib4, @function
+kernel_dgemm_nn_4x12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x12_lib4
+_kernel_dgemm_nn_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x12_lib4
+ .def kernel_dgemm_nn_4x12_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x12_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x12_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x12_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x12_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x12_lib4, .-kernel_dgemm_nn_4x12_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dgemm_nn_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_12x4_vs_lib4
+ .type kernel_dgemm_nn_12x4_vs_lib4, @function
+kernel_dgemm_nn_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_12x4_vs_lib4
+_kernel_dgemm_nn_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_12x4_vs_lib4
+ .def kernel_dgemm_nn_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+	movq	ARG6, %r14 // sdb
+	sall	$5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // store address D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_12x4_vs_lib4, .-kernel_dgemm_nn_12x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dsyrk_nt_l_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_12x4_lib4
+ .type kernel_dsyrk_nt_l_12x4_lib4, @function
+kernel_dsyrk_nt_l_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_12x4_lib4
+_kernel_dsyrk_nt_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_12x4_lib4
+ .def kernel_dsyrk_nt_l_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_12x4_lib4, .-kernel_dsyrk_nt_l_12x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dsyrk_nt_l_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_12x4_vs_lib4
+ .type kernel_dsyrk_nt_l_12x4_vs_lib4, @function
+kernel_dsyrk_nt_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_12x4_vs_lib4
+_kernel_dsyrk_nt_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_12x4_vs_lib4
+ .def kernel_dsyrk_nt_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq	ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_12x4_vs_lib4, .-kernel_dsyrk_nt_l_12x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nn_rl_12x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_12x4_lib4
+ .type kernel_dtrmm_nn_rl_12x4_lib4, @function
+kernel_dtrmm_nn_rl_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_12x4_lib4
+_kernel_dtrmm_nn_rl_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_12x4_lib4
+ .def kernel_dtrmm_nn_rl_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_12x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_12x4_lib4, .-kernel_dtrmm_nn_rl_12x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dtrmm_nn_rl_12x4_vs_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_12x4_vs_lib4
+ .type kernel_dtrmm_nn_rl_12x4_vs_lib4, @function
+kernel_dtrmm_nn_rl_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_12x4_vs_lib4
+_kernel_dtrmm_nn_rl_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_12x4_vs_lib4
+ .def kernel_dtrmm_nn_rl_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_12x4_vs_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall	$5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_12x4_vs_lib4, .-kernel_dtrmm_nn_rl_12x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrmm_nt_ru_12x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_12x4_lib4
+ .type kernel_dtrmm_nt_ru_12x4_lib4, @function
+kernel_dtrmm_nt_ru_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_12x4_lib4
+_kernel_dtrmm_nt_ru_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_12x4_lib4
+ .def kernel_dtrmm_nt_ru_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d //k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ addq $128, %r13 // B+4*bs
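+	// the leading 4x4 block of B is upper triangular, so the main nt loop runs
+	// only over the remaining k-4 columns starting at A+4*bs and B+4*bs; the
+	// triangular contribution is added by the dtrmm edge routine further below.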
+
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+// call inner blend
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_12x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_12x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10 // A
+ movq ARG4, %r11 // sda
+ sall $5, %r11d // 4*sda*sizeof(double)
+ movq ARG5, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_12x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_12x4_lib4, .-kernel_dtrmm_nt_ru_12x4_lib4
+#endif
+
+
+
+
+
+// rdi    rsi           rdx       rcx      r8        r9           rsp+8     rsp+16   rsp+24    rsp+32   rsp+40  rsp+48
+// void kernel_dtrmm_nt_ru_12x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_12x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_12x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_12x4_vs_lib4
+_kernel_dtrmm_nt_ru_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_12x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d //k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ addq $128, %r13 // B+4*bs
+
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+// call inner blend
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_12x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_12x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10 // A
+ movq ARG4, %r11 // sda
+ sall $5, %r11d // 4*sda*sizeof(double)
+ movq ARG5, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_12x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_12x4_vs_lib4, .-kernel_dtrmm_nt_ru_12x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dpotrf_nt_l_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_12x4_lib4
+ .type kernel_dpotrf_nt_l_12x4_lib4, @function
+kernel_dpotrf_nt_l_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_12x4_lib4
+_kernel_dpotrf_nt_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_12x4_lib4
+ .def kernel_dpotrf_nt_l_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d
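+	// the fixed-size kernel reuses the _vs factorization edge with kn hard-coded
+	// to 4, i.e. the full 4-column diagonal block is factorized; the _vs variant
+	// of this kernel passes its kn argument here instead.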
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_12x4_lib4, .-kernel_dpotrf_nt_l_12x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dpotrf_nt_l_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_12x4_vs_lib4
+ .type kernel_dpotrf_nt_l_12x4_vs_lib4, @function
+kernel_dpotrf_nt_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_12x4_vs_lib4
+_kernel_dpotrf_nt_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_12x4_vs_lib4
+ .def kernel_dpotrf_nt_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_12x4_vs_lib4, .-kernel_dpotrf_nt_l_12x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dsyrk_dpotrf_nt_l_12x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_12x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_12x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_12x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_12x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
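+	// the fused kernel first accumulates Ap*Bp' over kp columns (add) and then
+	// subtracts Am*Bm' over km columns (sub); the scale_11 routine below adds C
+	// to this difference before the diagonal block is factorized.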
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_12x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_12x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_12x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_12x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_12x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_12x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_12x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_12x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x12_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x12_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x12_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x12_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x12_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // B
+ movq ARG4, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+ movq ARG2, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x12_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG9, %r12 // inv_diag_E
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG6, %r10 // store address D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x12_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x12_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x12_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x12_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
+// void kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movq ARG16, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG15, %r12 // km
+ movq ARG16, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4(int kp, double *Ap, double *Bp, int sdbp, int km, double *Am, double *Bm, int sdbm, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG3, %r11 // Bp
+ movq ARG4, %r12 // sdbp
+ sall $5, %r12d // 32*sdbp
+ movq ARG2, %r13 // Ap
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG7, %r11 // Bm
+ movq ARG8, %r12 // sdbm
+ sall $5, %r12d // 32*sdbm
+ movq ARG6, %r13 // Am
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+	movq	ARG9, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x12_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG11, %r10 // E
+ movq ARG12, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG13, %r12 // inv_diag_E
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x12_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // store address D
+ movq ARG14, %r11 // km
+ movq ARG15, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X12_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x12_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x12_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nt_rl_inv_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_12x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_12x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_12x4_lib4
+_kernel_dtrsm_nt_rl_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_12x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+	movq	ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
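+	// inv_diag_E presumably holds the reciprocals of the diagonal of E, so that
+	// the triangular solve can multiply by them instead of dividing.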
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_12x4_lib4, .-kernel_dtrsm_nt_rl_inv_12x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_dtrsm_nt_rl_inv_4x12_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sde, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x12_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x12_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x12_lib4
+_kernel_dtrsm_nt_rl_inv_4x12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x12_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x12_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // B
+	movq	ARG4, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG2, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+	movq	ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x12_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG9, %r12 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x12_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG6, %r10 // store address D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x12_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x12_lib4, .-kernel_dtrsm_nt_rl_inv_4x12_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_one_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
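+// (The _one suffix marks the unit-diagonal variant: the diagonal of E is taken as 1,
+// which is presumably why no inv_diag_E argument is needed and the *_one edge routine
+// is called.)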
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_12x4_lib4
+ .type kernel_dtrsm_nt_rl_one_12x4_lib4, @function
+kernel_dtrsm_nt_rl_one_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_12x4_lib4
+_kernel_dtrsm_nt_rl_one_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_12x4_lib4
+ .def kernel_dtrsm_nt_rl_one_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_12x4_lib4, .-kernel_dtrsm_nt_rl_one_12x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nt_rl_one_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
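+// (_vs = variable size: km and kn give how many of the 12 rows and 4 columns of the
+// tile are actually valid; the store goes through inner_store_12x4_vs_lib4, which
+// presumably masks the write accordingly.)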
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_12x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_one_12x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_12x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_12x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_one_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_12x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_12x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nt_ru_inv_12x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_12x4_lib4
+ .type kernel_dtrsm_nt_ru_inv_12x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_12x4_lib4
+_kernel_dtrsm_nt_ru_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_12x4_lib4
+ .def kernel_dtrsm_nt_ru_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_12x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_12x4_lib4, .-kernel_dtrsm_nt_ru_inv_12x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_12x4_vs_lib4
+ .type kernel_dtrsm_nt_ru_inv_12x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_12x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_12x4_vs_lib4
+ .def kernel_dtrsm_nt_ru_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_12x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_12x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nn_ru_inv_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
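+// (The nn variants traverse B along its panels as well, hence the extra sdb argument,
+// converted to a byte stride with the same sall $5 idiom as sda.)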
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_12x4_lib4
+ .type kernel_dtrsm_nn_ru_inv_12x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_12x4_lib4
+_kernel_dtrsm_nn_ru_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_12x4_lib4
+ .def kernel_dtrsm_nn_ru_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_12x4_lib4, .-kernel_dtrsm_nn_ru_inv_12x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrsm_nn_ru_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_12x4_vs_lib4
+ .type kernel_dtrsm_nn_ru_inv_12x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_12x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_12x4_vs_lib4
+ .def kernel_dtrsm_nn_ru_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_12x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_12x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nn_ll_one_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_12x4_lib4
+ .type kernel_dtrsm_nn_ll_one_12x4_lib4, @function
+kernel_dtrsm_nn_ll_one_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_12x4_lib4
+_kernel_dtrsm_nn_ll_one_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_12x4_lib4
+ .def kernel_dtrsm_nn_ll_one_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_12x4_lib4, .-kernel_dtrsm_nn_ll_one_12x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrsm_nn_ll_one_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_12x4_vs_lib4
+ .type kernel_dtrsm_nn_ll_one_12x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_12x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_12x4_vs_lib4
+ .def kernel_dtrsm_nn_ll_one_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_12x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_12x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nn_lu_inv_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_12x4_lib4
+ .type kernel_dtrsm_nn_lu_inv_12x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_12x4_lib4
+_kernel_dtrsm_nn_lu_inv_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_12x4_lib4
+ .def kernel_dtrsm_nn_lu_inv_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_12x4_lib4, .-kernel_dtrsm_nn_lu_inv_12x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dtrsm_nn_lu_inv_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_12x4_vs_lib4
+ .type kernel_dtrsm_nn_lu_inv_12x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_12x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_12x4_vs_lib4
+ .def kernel_dtrsm_nn_lu_inv_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+ movq ARG13, %r13 // km
+
+#if MACRO_LEVEL>=1
+	INNER_EDGE_DTRSM_LUN_INV_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_12x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG13, %r12 // km
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_12x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_12x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgetrf_nn_l_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
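+// (Flow, as read from the call sequence below: acc = C - A*B, then
+// inner_edge_dgetrf_l_12x4 factorizes the 12x4 tile in place and writes the reciprocals
+// of the computed diagonal to inv_diag_D for use by later trsm kernels.)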
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_l_12x4_lib4
+ .type kernel_dgetrf_nn_l_12x4_lib4, @function
+kernel_dgetrf_nn_l_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_l_12x4_lib4
+_kernel_dgetrf_nn_l_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_l_12x4_lib4
+ .def kernel_dgetrf_nn_l_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_l_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_l_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_l_12x4_lib4, .-kernel_dgetrf_nn_l_12x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgetrf_nn_l_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_l_12x4_vs_lib4
+ .type kernel_dgetrf_nn_l_12x4_vs_lib4, @function
+kernel_dgetrf_nn_l_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_l_12x4_vs_lib4
+_kernel_dgetrf_nn_l_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_l_12x4_vs_lib4
+ .def kernel_dgetrf_nn_l_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_L_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_l_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_l_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // km
+	movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_l_12x4_vs_lib4, .-kernel_dgetrf_nn_l_12x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgetrf_nn_m_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_m_12x4_lib4
+ .type kernel_dgetrf_nn_m_12x4_lib4, @function
+kernel_dgetrf_nn_m_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_m_12x4_lib4
+_kernel_dgetrf_nn_m_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_m_12x4_lib4
+ .def kernel_dgetrf_nn_m_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_m_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG8, %r10 // D
+ subq $128, %r10 // E
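+	// (a reading: 128 = 4*4*sizeof(double), so E points one 4x4 block to the left of D
+	// within the same row panel; the _r variant below uses 256, i.e. two blocks)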
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_M_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_m_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_m_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_m_12x4_lib4, .-kernel_dgetrf_nn_m_12x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgetrf_nn_m_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_m_12x4_vs_lib4
+ .type kernel_dgetrf_nn_m_12x4_vs_lib4, @function
+kernel_dgetrf_nn_m_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_m_12x4_vs_lib4
+_kernel_dgetrf_nn_m_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_m_12x4_vs_lib4
+ .def kernel_dgetrf_nn_m_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_m_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG8, %r10 // D
+ subq $128, %r10 // E
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_M_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_m_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_m_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // km
+	movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_m_12x4_vs_lib4, .-kernel_dgetrf_nn_m_12x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgetrf_nn_r_12x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_r_12x4_lib4
+ .type kernel_dgetrf_nn_r_12x4_lib4, @function
+kernel_dgetrf_nn_r_12x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_r_12x4_lib4
+_kernel_dgetrf_nn_r_12x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_r_12x4_lib4
+ .def kernel_dgetrf_nn_r_12x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_r_12x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG8, %r10 // D
+ subq $256, %r10 // E
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_R_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_r_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_r_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_r_12x4_lib4, .-kernel_dgetrf_nn_r_12x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgetrf_nn_r_12x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_r_12x4_vs_lib4
+ .type kernel_dgetrf_nn_r_12x4_vs_lib4, @function
+kernel_dgetrf_nn_r_12x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_r_12x4_vs_lib4
+_kernel_dgetrf_nn_r_12x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_r_12x4_vs_lib4
+ .def kernel_dgetrf_nn_r_12x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_r_12x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_12x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_12x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG8, %r10 // D
+ subq $256, %r10 // E
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_R_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_r_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_r_12x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // km
+	movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_12X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_12x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_12x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_r_12x4_vs_lib4, .-kernel_dgetrf_nn_r_12x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dlarfb12_r_4_lib4(int kmax, double *pV, int sdd, double *pT, double *pD, double *pK, int km);
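+// (By analogy with LAPACK's dlarfb: apply a block reflector built from 12 Householder
+// vectors (pV, with triangular factor pT) from the right to a 4-row block pD, using pK
+// as workspace for the intermediate 4x12 product; km masks partially valid rows.)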
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb12_r_4_lib4
+ .type kernel_dlarfb12_r_4_lib4, @function
+kernel_dlarfb12_r_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb12_r_4_lib4
+_kernel_dlarfb12_r_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb12_r_4_lib4
+ .def kernel_dlarfb12_r_4_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb12_r_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+// vxorpd %ymm0, %ymm0, %ymm0
+// vmovapd %ymm0, %ymm1
+// vmovapd %ymm0, %ymm2
+// vmovapd %ymm0, %ymm3
+// vmovapd %ymm0, %ymm4
+// vmovapd %ymm0, %ymm5
+// vmovapd %ymm0, %ymm6
+// vmovapd %ymm0, %ymm7
+// vmovapd %ymm0, %ymm8
+// vmovapd %ymm0, %ymm9
+// vmovapd %ymm0, %ymm10
+// vmovapd %ymm0, %ymm11
+
+ movq ARG1, %r10 // k
+ movq ARG5, %r11 // D
+ movq ARG2, %r12 // V
+ movq ARG3, %r13 // sdd
+ sall $5, %r13d
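+	// (the three unrolled 4-column blocks below appear to accumulate the product of the
+	// 4-row block at pD with the leading 12 columns of pV into ymm0..ymm11; the plain
+	// vmovapd copies stand in for the implicit unit entries of V)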
+
+ //
+ vmovapd 0(%r11), %ymm12
+ vmovapd %ymm12, %ymm0
+ //
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vmovapd %ymm12, %ymm1
+ //
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vmovapd %ymm12, %ymm2
+ //
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vmovapd %ymm12, %ymm3
+ //
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+ //
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vmovapd %ymm12, %ymm4
+ //
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vmovapd %ymm12, %ymm5
+ //
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vmovapd %ymm12, %ymm6
+ //
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vmovapd %ymm12, %ymm7
+ //
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+ //
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 8(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 16(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 24(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 0(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 8(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 16(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 24(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vmovapd %ymm12, %ymm8
+ //
+ vmovapd 32(%r11), %ymm12
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 40(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 48(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 56(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 32(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 40(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 48(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 56(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 32(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vmovapd %ymm12, %ymm9
+ //
+ vmovapd 64(%r11), %ymm12
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 80(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 88(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 64(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 72(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 80(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 88(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 64(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 72(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vmovapd %ymm12, %ymm10
+ //
+ vmovapd 96(%r11), %ymm12
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 120(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 96(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 104(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 112(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 120(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 96(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 104(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 112(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vmovapd %ymm12, %ymm11
+ //
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ movq %r11, %r14
+ movq %r12, %r11
+ movq %r13, %r12
+ movq %r14, %r13
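+	// (a reading: the pointer registers are rotated so that the generic NT inner kernel
+	// below sees pV as its A operand and the advanced pD pointer as its B operand for the
+	// remaining kmax-12 columns, with inner_tran_12x4 transposing the accumulator before
+	// and after the call)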
+
+#if MACRO_LEVEL>=2
+ INNER_TRAN_12X4_LIB4
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+ INNER_TRAN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_12x4_lib4
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+ call inner_tran_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_12x4_lib4
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+ callq _inner_tran_12x4_lib4
+#endif
+#endif
+
+ movq ARG4, %r11 // T
+	movq $384, %r12 // sdt: hard-coded to 4*12*sizeof(double)
+
+ //
+ vbroadcastsd 376(%r11, %r12, 2), %ymm13
+ vmulpd %ymm11, %ymm13, %ymm11
+ //
+ vbroadcastsd 368(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm10, %ymm13, %ymm11
+ vbroadcastsd 336(%r11, %r12, 2), %ymm13
+ vmulpd %ymm10, %ymm13, %ymm10
+ //
+ vbroadcastsd 360(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm9, %ymm13, %ymm11
+ vbroadcastsd 328(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 296(%r11, %r12, 2), %ymm13
+ vmulpd %ymm9, %ymm13, %ymm9
+ //
+ vbroadcastsd 352(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm8, %ymm13, %ymm11
+ vbroadcastsd 320(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 288(%r11, %r12, 2), %ymm13
+ vfmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 256(%r11, %r12, 2), %ymm13
+ vmulpd %ymm8, %ymm13, %ymm8
+ //
+ vbroadcastsd 376(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm7, %ymm13, %ymm11
+ vbroadcastsd 344(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm7, %ymm13, %ymm10
+ vbroadcastsd 312(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm7, %ymm13, %ymm9
+ vbroadcastsd 280(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm7, %ymm13, %ymm8
+ vbroadcastsd 248(%r11, %r12, 1), %ymm13
+ vmulpd %ymm7, %ymm13, %ymm7
+ //
+ vbroadcastsd 368(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm6, %ymm13, %ymm11
+ vbroadcastsd 336(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm6, %ymm13, %ymm10
+ vbroadcastsd 304(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm6, %ymm13, %ymm9
+ vbroadcastsd 272(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm6, %ymm13, %ymm8
+ vbroadcastsd 240(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm6, %ymm13, %ymm7
+ vbroadcastsd 208(%r11, %r12, 1), %ymm13
+ vmulpd %ymm6, %ymm13, %ymm6
+ //
+ vbroadcastsd 360(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm5, %ymm13, %ymm11
+ vbroadcastsd 328(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm5, %ymm13, %ymm10
+ vbroadcastsd 296(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm5, %ymm13, %ymm9
+ vbroadcastsd 264(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm5, %ymm13, %ymm8
+ vbroadcastsd 232(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm5, %ymm13, %ymm7
+ vbroadcastsd 200(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 168(%r11, %r12, 1), %ymm13
+ vmulpd %ymm5, %ymm13, %ymm5
+ //
+ vbroadcastsd 352(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm4, %ymm13, %ymm11
+ vbroadcastsd 320(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm4, %ymm13, %ymm10
+ vbroadcastsd 288(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm4, %ymm13, %ymm9
+ vbroadcastsd 256(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm4, %ymm13, %ymm8
+ vbroadcastsd 224(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm4, %ymm13, %ymm7
+ vbroadcastsd 192(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 160(%r11, %r12, 1), %ymm13
+ vfmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 128(%r11, %r12, 1), %ymm13
+ vmulpd %ymm4, %ymm13, %ymm4
+ //
+ vbroadcastsd 376(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm11
+ vbroadcastsd 344(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm10
+ vbroadcastsd 312(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm9
+ vbroadcastsd 280(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm8
+ vbroadcastsd 248(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm7
+ vbroadcastsd 216(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm6
+ vbroadcastsd 184(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm5
+ vbroadcastsd 152(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm4
+ vbroadcastsd 120(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ //
+ vbroadcastsd 368(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm11
+ vbroadcastsd 336(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm10
+ vbroadcastsd 304(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm9
+ vbroadcastsd 272(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm8
+ vbroadcastsd 240(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm7
+ vbroadcastsd 208(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm6
+ vbroadcastsd 176(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm5
+ vbroadcastsd 144(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm4
+ vbroadcastsd 112(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm3
+ vbroadcastsd 80(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ //
+ vbroadcastsd 360(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm11
+ vbroadcastsd 328(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm10
+ vbroadcastsd 296(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm9
+ vbroadcastsd 264(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm8
+ vbroadcastsd 232(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm7
+ vbroadcastsd 200(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm6
+ vbroadcastsd 168(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm5
+ vbroadcastsd 136(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm4
+ vbroadcastsd 104(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 40(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ //
+ vbroadcastsd 352(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm11
+ vbroadcastsd 320(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm10
+ vbroadcastsd 288(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm9
+ vbroadcastsd 256(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm8
+ vbroadcastsd 224(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm7
+ vbroadcastsd 192(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm6
+ vbroadcastsd 160(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm5
+ vbroadcastsd 128(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm4
+ vbroadcastsd 96(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+
+ movq ARG6, %r10 // K
+ movq ARG7, %r11 // km
+
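+	// if fewer than 4 rows are valid (km < 4), build a lane mask from km and the
+	// constant pool entry LC02 and zero the rows beyond km in every column of the
+	// result before it is stored to pK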
+ cmpl $4, %r11d
+ jge 0f
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vblendvpd %ymm15, %ymm11, %ymm14, %ymm11
+ vblendvpd %ymm15, %ymm10, %ymm14, %ymm10
+ vblendvpd %ymm15, %ymm9, %ymm14, %ymm9
+ vblendvpd %ymm15, %ymm8, %ymm14, %ymm8
+ vblendvpd %ymm15, %ymm7, %ymm14, %ymm7
+ vblendvpd %ymm15, %ymm6, %ymm14, %ymm6
+ vblendvpd %ymm15, %ymm5, %ymm14, %ymm5
+ vblendvpd %ymm15, %ymm4, %ymm14, %ymm4
+ vblendvpd %ymm15, %ymm3, %ymm14, %ymm3
+ vblendvpd %ymm15, %ymm2, %ymm14, %ymm2
+ vblendvpd %ymm15, %ymm1, %ymm14, %ymm1
+ vblendvpd %ymm15, %ymm0, %ymm14, %ymm0
+
+0:
+ vmovapd %ymm11, 352(%r10)
+ vmovapd %ymm10, 320(%r10)
+ vmovapd %ymm9, 288(%r10)
+ vmovapd %ymm8, 256(%r10)
+ vmovapd %ymm7, 224(%r10)
+ vmovapd %ymm6, 192(%r10)
+ vmovapd %ymm5, 160(%r10)
+ vmovapd %ymm4, 128(%r10)
+ vmovapd %ymm3, 96(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm0, 0(%r10)
+
+ movq ARG1, %r10 // n
+ movq ARG6, %r11 // K
+ movq ARG2, %r12 // V
+ movq ARG3, %r13 // sdd
+ sall $5, %r13d
+ movq ARG5, %r14 // D
+
+ // load block from C
+ vmovapd 0(%r14), %ymm0
+ vmovapd 32(%r14), %ymm1
+ vmovapd 64(%r14), %ymm2
+ vmovapd 96(%r14), %ymm3
+ vmovapd 128(%r14), %ymm4
+ vmovapd 160(%r14), %ymm5
+ vmovapd 192(%r14), %ymm6
+ vmovapd 224(%r14), %ymm7
+ vmovapd 256(%r14), %ymm8
+ vmovapd 288(%r14), %ymm9
+ vmovapd 320(%r14), %ymm10
+ vmovapd 352(%r14), %ymm11
+
+ // 0
+ vmovapd 0(%r11), %ymm12
+ vaddpd %ymm12, %ymm0, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 128(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 160(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 1
+ vmovapd 32(%r11), %ymm12
+ vaddpd %ymm12, %ymm1, %ymm1
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 136(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 168(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 200(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 2
+ vmovapd 64(%r11), %ymm12
+ vaddpd %ymm12, %ymm2, %ymm2
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vbroadcastsd 144(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 176(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 208(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 240(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 3
+ vmovapd 96(%r11), %ymm12
+ vaddpd %ymm12, %ymm3, %ymm3
+ vbroadcastsd 152(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm4
+ vbroadcastsd 184(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 216(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 248(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 280(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 4
+ vmovapd 128(%r11), %ymm12
+ vaddpd %ymm12, %ymm4, %ymm4
+ vbroadcastsd 160(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm5
+ vbroadcastsd 192(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 224(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 256(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 288(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 5
+ vmovapd 160(%r11), %ymm12
+ vaddpd %ymm12, %ymm5, %ymm5
+ vbroadcastsd 200(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm6
+ vbroadcastsd 232(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 264(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 296(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 328(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 6
+ vmovapd 192(%r11), %ymm12
+ vaddpd %ymm12, %ymm6, %ymm6
+ vbroadcastsd 240(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm7
+ vbroadcastsd 272(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 304(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 336(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 368(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 7
+ vmovapd 224(%r11), %ymm12
+ vaddpd %ymm12, %ymm7, %ymm7
+ vbroadcastsd 280(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 312(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 344(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 376(%r12, %r13, 1), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 8
+ vmovapd 256(%r11), %ymm12
+ vaddpd %ymm12, %ymm8, %ymm8
+ vbroadcastsd 288(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 320(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 352(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 9
+ vmovapd 288(%r11), %ymm12
+ vaddpd %ymm12, %ymm9, %ymm9
+ vbroadcastsd 328(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 360(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 10
+ vmovapd 320(%r11), %ymm12
+ vaddpd %ymm12, %ymm10, %ymm10
+ vbroadcastsd 368(%r12, %r13, 2), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm11
+ // 11
+ vmovapd 352(%r11), %ymm12
+ vaddpd %ymm12, %ymm11, %ymm11
+
+ // store block to C
+ vmovapd %ymm0, 0(%r14)
+ vmovapd %ymm1, 32(%r14)
+ vmovapd %ymm2, 64(%r14)
+ vmovapd %ymm3, 96(%r14)
+ vmovapd %ymm4, 128(%r14)
+ vmovapd %ymm5, 160(%r14)
+ vmovapd %ymm6, 192(%r14)
+ vmovapd %ymm7, 224(%r14)
+ vmovapd %ymm8, 256(%r14)
+ vmovapd %ymm9, 288(%r14)
+ vmovapd %ymm10, 320(%r14)
+ vmovapd %ymm11, 352(%r14)
+
+ subl $12, %r10d
+ addq $384, %r12
+ addq $384, %r14
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEBP_ADD_NN_4X12_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgebp_add_nn_4x12_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgebp_add_nn_4x12_lib4
+#endif
+#endif
+
+100:
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb12_r_4_lib4, .-kernel_dlarfb12_r_4_lib4
+#endif
+
+
+
+
+
+ // 1 2 3 4 5
+// void kernel_dlarfb4_r_12_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
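+//
+// presumably applies the block reflector defined by the 4 Householder vectors
+// in pV and the triangular factor pT from the right to a 12x4 block of pD
+// (LAPACK dlarfb-style update); sdd is the 4-row panel stride of pD. The
+// exact sign/transpose convention is the one built by the surrounding QR
+// kernels.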
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb4_r_12_lib4
+ .type kernel_dlarfb4_r_12_lib4, @function
+kernel_dlarfb4_r_12_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb4_r_12_lib4
+_kernel_dlarfb4_r_12_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb4_r_12_lib4
+ .def kernel_dlarfb4_r_12_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_12_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+// vxorpd %ymm0, %ymm0, %ymm0
+// vmovapd %ymm0, %ymm1
+// vmovapd %ymm0, %ymm2
+// vmovapd %ymm0, %ymm3
+// vmovapd %ymm0, %ymm4
+// vmovapd %ymm0, %ymm5
+// vmovapd %ymm0, %ymm6
+// vmovapd %ymm0, %ymm7
+// vmovapd %ymm0, %ymm8
+// vmovapd %ymm0, %ymm9
+// vmovapd %ymm0, %ymm10
+// vmovapd %ymm0, %ymm11
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG5, %r12 // sdd
+ sall $5, %r12d
+ movq ARG2, %r13 // V
+
+ //
+ vmovapd 0(%r11), %ymm0
+ vmovapd 0(%r11, %r12, 1), %ymm4
+ vmovapd 0(%r11, %r12, 2), %ymm8
+ //
+ vmovapd 32(%r11), %ymm1
+ vmovapd 32(%r11, %r12, 1), %ymm5
+ vmovapd 32(%r11, %r12, 2), %ymm9
+ vbroadcastsd 32(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm1, %ymm0
+ vfmadd231pd %ymm13, %ymm5, %ymm4
+ vfmadd231pd %ymm13, %ymm9, %ymm8
+ //
+ vmovapd 64(%r11), %ymm2
+ vmovapd 64(%r11, %r12, 1), %ymm6
+ vmovapd 64(%r11, %r12, 2), %ymm10
+ vbroadcastsd 64(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm2, %ymm0
+ vfmadd231pd %ymm13, %ymm6, %ymm4
+ vfmadd231pd %ymm13, %ymm10, %ymm8
+ vbroadcastsd 72(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm2, %ymm1
+ vfmadd231pd %ymm13, %ymm6, %ymm5
+ vfmadd231pd %ymm13, %ymm10, %ymm9
+ //
+ vmovapd 96(%r11), %ymm3
+ vmovapd 96(%r11, %r12, 1), %ymm7
+ vmovapd 96(%r11, %r12, 2), %ymm11
+ vbroadcastsd 96(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm0
+ vfmadd231pd %ymm13, %ymm7, %ymm4
+ vfmadd231pd %ymm13, %ymm11, %ymm8
+ vbroadcastsd 104(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm1
+ vfmadd231pd %ymm13, %ymm7, %ymm5
+ vfmadd231pd %ymm13, %ymm11, %ymm9
+ vbroadcastsd 112(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm2
+ vfmadd231pd %ymm13, %ymm7, %ymm6
+ vfmadd231pd %ymm13, %ymm11, %ymm10
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_12x4_lib4
+#endif
+#endif
+
+ movq ARG3, %r10 // T
+
+ //
+ vbroadcastsd 120(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vmulpd %ymm11, %ymm12, %ymm11
+ //
+ vbroadcastsd 112(%r10), %ymm12
+ vfmadd231pd %ymm2, %ymm12, %ymm3
+ vfmadd231pd %ymm6, %ymm12, %ymm7
+ vfmadd231pd %ymm10, %ymm12, %ymm11
+ vbroadcastsd 80(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vmulpd %ymm10, %ymm12, %ymm10
+ //
+ vbroadcastsd 104(%r10), %ymm12
+ vfmadd231pd %ymm1, %ymm12, %ymm3
+ vfmadd231pd %ymm5, %ymm12, %ymm7
+ vfmadd231pd %ymm9, %ymm12, %ymm11
+ vbroadcastsd 72(%r10), %ymm12
+ vfmadd231pd %ymm1, %ymm12, %ymm2
+ vfmadd231pd %ymm5, %ymm12, %ymm6
+ vfmadd231pd %ymm9, %ymm12, %ymm10
+ vbroadcastsd 40(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vmulpd %ymm9, %ymm12, %ymm9
+ //
+ vbroadcastsd 96(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm3
+ vfmadd231pd %ymm4, %ymm12, %ymm7
+ vfmadd231pd %ymm8, %ymm12, %ymm11
+ vbroadcastsd 64(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm2
+ vfmadd231pd %ymm4, %ymm12, %ymm6
+ vfmadd231pd %ymm8, %ymm12, %ymm10
+ vbroadcastsd 32(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm1
+ vfmadd231pd %ymm4, %ymm12, %ymm5
+ vfmadd231pd %ymm8, %ymm12, %ymm9
+ vbroadcastsd 0(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+ vmulpd %ymm8, %ymm12, %ymm8
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // V
+ movq ARG4, %r12 // D
+ movq ARG5, %r13 // sdd
+ sall $5, %r13d
+
+ //
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vmovapd 0(%r12, %r13, 2), %ymm15
+ vaddpd %ymm12, %ymm0, %ymm12
+ vaddpd %ymm14, %ymm4, %ymm14
+ vaddpd %ymm15, %ymm8, %ymm15
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+ vmovapd %ymm15, 0(%r12, %r13, 2)
+ //
+ vmovapd 32(%r12), %ymm12
+ vmovapd 32(%r12, %r13, 1), %ymm14
+ vmovapd 32(%r12, %r13, 2), %ymm15
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vaddpd %ymm12, %ymm1, %ymm12
+ vaddpd %ymm14, %ymm5, %ymm14
+ vaddpd %ymm15, %ymm9, %ymm15
+ vmovapd %ymm12, 32(%r12)
+ vmovapd %ymm14, 32(%r12, %r13, 1)
+ vmovapd %ymm15, 32(%r12, %r13, 2)
+ //
+ vmovapd 64(%r12), %ymm12
+ vmovapd 64(%r12, %r13, 1), %ymm14
+ vmovapd 64(%r12, %r13, 2), %ymm15
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vfmadd231pd %ymm9, %ymm13, %ymm15
+ vaddpd %ymm12, %ymm2, %ymm12
+ vaddpd %ymm14, %ymm6, %ymm14
+ vaddpd %ymm15, %ymm10, %ymm15
+ vmovapd %ymm12, 64(%r12)
+ vmovapd %ymm14, 64(%r12, %r13, 1)
+ vmovapd %ymm15, 64(%r12, %r13, 2)
+ //
+ vmovapd 96(%r12), %ymm12
+ vmovapd 96(%r12, %r13, 1), %ymm14
+ vmovapd 96(%r12, %r13, 2), %ymm15
+ vbroadcastsd 96(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vfmadd231pd %ymm8, %ymm13, %ymm15
+ vbroadcastsd 104(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vfmadd231pd %ymm9, %ymm13, %ymm15
+ vbroadcastsd 112(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vfmadd231pd %ymm10, %ymm13, %ymm15
+ vaddpd %ymm12, %ymm3, %ymm12
+ vaddpd %ymm14, %ymm7, %ymm14
+ vaddpd %ymm15, %ymm11, %ymm15
+ vmovapd %ymm12, 96(%r12)
+ vmovapd %ymm14, 96(%r12, %r13, 1)
+ vmovapd %ymm15, 96(%r12, %r13, 2)
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEBP_ADD_NN_12X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgebp_add_nn_12x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgebp_add_nn_12x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb4_r_12_lib4, .-kernel_dlarfb4_r_12_lib4
+#endif
+
+
+
+
+
+// read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
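+// (each pair of .long values above is the little-endian low/high IEEE-754
+// word of one of the doubles listed in the comment)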
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+LC04: // { 11.5 10.5 9.5 8.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1075904512
+ .long 0
+ .long 1076035584
+ .long 0
+ .long 1076166656
+ .long 0
+ .long 1076297728
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC05: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC05: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgemm_4x4_lib4.S b/kernel/avx2/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..c9bf696
--- /dev/null
+++ b/kernel/avx2/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,9433 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
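+
+// note: the Windows prologue/epilogue additionally save and restore rdi, rsi
+// and xmm6-xmm15, which are callee-saved under the Win64 calling convention
+// (hence the larger STACKSIZE)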
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
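+//
+// roughly equivalent scalar code (a sketch; A and B are bs=4 panel-major,
+// one 4-element column every 32 bytes, and the 4x4 accumulator D is kept in
+// registers in the lane layout documented above):
+//
+//	for(kk=0; kk<k; kk++)
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<4; ii++)
+//				D[ii+4*jj] += A[ii+4*kk] * B[jj+4*kk];	// D += A * B^T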
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+// broadcast scheme
+#if 1
+
+ cmpl $0, %r10d
+ jle 5f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ vxorpd %ymm4, %ymm4, %ymm4
+ vmovapd %ymm4, %ymm5
+ vmovapd %ymm4, %ymm6
+ vmovapd %ymm4, %ymm7
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastsd -32(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd -24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd -16(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd -8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastsd -32(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd -24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd -16(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd -8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+
+ addq $32, %r11
+ addq $32, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // reduce
+
+ vaddpd %ymm4, %ymm0, %ymm0
+ vaddpd %ymm5, %ymm1, %ymm1
+ vaddpd %ymm6, %ymm2, %ymm2
+ vaddpd %ymm7, %ymm3, %ymm3
+
+5: // return
+
+// shuffle scheme
+#else
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ vxorpd %ymm4, %ymm4, %ymm4
+	vmovapd	%ymm4, %ymm5
+	vmovapd	%ymm4, %ymm6
+	vmovapd	%ymm4, %ymm7
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r12), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vfmadd231pd %ymm10, %ymm13, %ymm4
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm5
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vfmadd231pd %ymm10, %ymm13, %ymm7
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vfmadd231pd %ymm10, %ymm14, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ addq $128, %r11
+
+
+ // unroll 3
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vfmadd231pd %ymm10, %ymm13, %ymm4
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm5
+ vmovapd 0(%r11), %ymm8 // A0[0]
+
+ vfmadd231pd %ymm10, %ymm13, %ymm7
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vfmadd231pd %ymm10, %ymm14, %ymm6
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r12), %ymm13 // B[4]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+
+ // unroll 1
+ vmovapd 64(%r12), %ymm12 // B[8]
+ vfmadd231pd %ymm10, %ymm13, %ymm4
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm5
+ vmovapd 64(%r11), %ymm8 // A0[8]
+
+ vfmadd231pd %ymm10, %ymm13, %ymm7
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vfmadd231pd %ymm10, %ymm14, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r12), %ymm13 // B[12]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r12
+
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ addq $128, %r11
+
+
+ // unroll 3
+// vmovapd 0(%r12), %ymm12 // B[0]
+ vfmadd231pd %ymm10, %ymm13, %ymm4
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm5
+// vmovapd 0(%r11), %ymm8 // A0[0]
+
+ vfmadd231pd %ymm10, %ymm13, %ymm7
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+
+ vfmadd231pd %ymm10, %ymm14, %ymm6
+
+
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r12), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ addq $32, %r11
+
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ addq $32, %r12
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vfmadd231pd %ymm8, %ymm14, %ymm3
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+2: // return
+
+ vaddpd %ymm4, %ymm0, %ymm0
+ vaddpd %ymm5, %ymm1, %ymm1
+ vaddpd %ymm6, %ymm2, %ymm2
+ vaddpd %ymm7, %ymm3, %ymm3
+
+#endif
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
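+//
+// same structure as inner_kernel_dgemm_add_nt_4x4_lib4 above, but using
+// vfnmadd231pd, i.e. the accumulators gather D -= A * B^T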
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 5f // return
+
+ vxorpd %ymm4, %ymm4, %ymm4
+ vmovapd %ymm4, %ymm5
+ vmovapd %ymm4, %ymm6
+ vmovapd %ymm4, %ymm7
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastsd -32(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd -24(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd -16(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd -8(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastsd -32(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd -24(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd -16(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd -8(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+
+ addq $32, %r11
+ addq $32, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // reduce
+
+ vaddpd %ymm4, %ymm0, %ymm0
+ vaddpd %ymm5, %ymm1, %ymm1
+ vaddpd %ymm6, %ymm2, %ymm2
+ vaddpd %ymm7, %ymm3, %ymm3
+
+5: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
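+//
+// roughly equivalent scalar code (a sketch; B is stored in 4-row panels with
+// panel stride sdb, hence the r13 = 4*sdb*sizeof(double) advance every 4
+// iterations):
+//
+//	for(kk=0; kk<k; kk++)
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<4; ii++)
+//				D[ii+4*jj] += A[ii+4*kk] * B[(kk/4)*4*sdb + kk%4 + 4*jj];	// D += A * B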
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x4_lib4, @function
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 5f // return
+
+ vxorpd %ymm4, %ymm4, %ymm4
+ vmovapd %ymm4, %ymm5
+ vmovapd %ymm4, %ymm6
+ vmovapd %ymm4, %ymm7
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // reduce
+
+ vaddpd %ymm4, %ymm0, %ymm0
+ vaddpd %ymm5, %ymm1, %ymm1
+ vaddpd %ymm6, %ymm2, %ymm2
+ vaddpd %ymm7, %ymm3, %ymm3
+
+5: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nn_4x4_lib4, @function
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 5f // return
+
+ vxorpd %ymm4, %ymm4, %ymm4
+ vmovapd %ymm4, %ymm5
+ vmovapd %ymm4, %ymm6
+ vmovapd %ymm4, %ymm7
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfnmadd231pd %ymm14, %ymm12, %ymm7
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfnmadd231pd %ymm13, %ymm12, %ymm3
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // reduce
+
+ vaddpd %ymm4, %ymm0, %ymm0
+ vaddpd %ymm5, %ymm1, %ymm1
+ vaddpd %ymm6, %ymm2, %ymm2
+ vaddpd %ymm7, %ymm3, %ymm3
+
+5: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nn_4x4_lib4, .-inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- B
+// r12 <- C
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- ?
+// r12 <- ?
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
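+//
+// roughly equivalent scalar code (a sketch; A[] below denotes the 4x4 block
+// held column-wise in ymm0..ymm3, and the result is accumulated directly
+// into C in memory, k columns at a time):
+//
+//	for(jj=0; jj<k; jj++)
+//		for(ii=0; ii<4; ii++)
+//			for(ll=0; ll<4; ll++)
+//				C[ii+4*jj] += A[ii+4*ll] * B[ll+4*jj];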
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgebp_add_nn_4x4_lib4, @function
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgebp_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r12), %ymm12
+ vbroadcastsd 0(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vbroadcastsd 8(%r11), %ymm13
+ subl $4, %r10d
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vbroadcastsd 16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vbroadcastsd 24(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vmovapd %ymm12, 0(%r12)
+
+ vmovapd 32(%r12), %ymm12
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vbroadcastsd 40(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vbroadcastsd 48(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vbroadcastsd 56(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vmovapd %ymm12, 32(%r12)
+
+ vmovapd 64(%r12), %ymm12
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vbroadcastsd 80(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vbroadcastsd 88(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vmovapd %ymm12, 64(%r12)
+
+ vmovapd 96(%r12), %ymm12
+ vbroadcastsd 96(%r11), %ymm13
+ addq $128, %r11
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vbroadcastsd -24(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vbroadcastsd -16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vbroadcastsd -8(%r11), %ymm13
+ addq $128, %r12
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vmovapd %ymm12, -32(%r12)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r12), %ymm12
+ vbroadcastsd 0(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vbroadcastsd 8(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vbroadcastsd 16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vbroadcastsd 24(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vmovapd %ymm12, 0(%r12)
+
+ addq $32, %r11
+ addq $32, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgebp_add_nn_4x4_lib4, .-inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
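+//
+// the edge consumes the first min(k, 4-offB) iterations one at a time,
+// starting at row offB inside the current 4-row panel of B, so that the
+// main kernel afterwards runs on a panel-aligned B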
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x4_lib4, @function
+inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %r15d
+ subl %r14d, %r15d // 4-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,4-offsetB)
+
+ movl %r14d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+	addq	$32, %r11 // A+1*bs*sizeof(double)
+	addq	$8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
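+//
+// the edge handles the first 4 iterations explicitly, using only the
+// upper-triangular entries of B (B(j,k) with j<=k in the nt storage), and
+// advances A and B past them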
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r10), %ymm8
+ vbroadcastsd 0(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ vmovapd 32(%r10), %ymm8
+ vbroadcastsd 32(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 40(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ vmovapd 64(%r10), %ymm8
+ vbroadcastsd 64(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 72(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 80(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 96(%r10), %ymm8
+ vbroadcastsd 96(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 104(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 112(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 120(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ addq $128, %r10
+ addq $128, %r11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ addq $32, %r11
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ addq $32, %r11
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ addq $32, %r11
+ vbroadcastsd 24(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ addq $32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
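+//
+// the branches below handle the B row offset offB = 0..3: each multiplies A
+// against the lower-triangular head of B (iteration kk uses only columns
+// 0..kk) and advances A and B so that B ends up panel-aligned for the main
+// kernel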
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_4x4_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jg 0f
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+0:
+ cmpl $1, %r14d
+ jg 1f
+
+ // offB==1
+
+ addq $8, %r12 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ subl $3, %r10d // k-3
+ addq $96, %r11 // A+3*bs*sizeof(double)
+ addq %r13, %r12
+ subq $8, %r12 // B+bs*sdb*sizeof(double)-1
+
+ jmp 3f
+
+1:
+ cmpl $2, %r14d
+ jg 2f
+
+ // offB==2
+
+ addq $16, %r12 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ subl $2, %r10d // k-2
+ addq $64, %r11 // A+2*bs*sizeof(double)
+ addq %r13, %r12
+ subq $16, %r12 // B+bs*sdb*sizeof(double)-2
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 72(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 104(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r12 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-3
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ vmovapd 32(%r11), %ymm8
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 40(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 72(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 64(%r11), %ymm8
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 48(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 56(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 88(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 120(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_4x4_lib4, .-inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
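+//
+// note: variable-size version of the edge above: k (r10d) is re-checked after
+// every single row update, so the routine is also correct when fewer rows
+// than a full triangular edge remain.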
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_4x4_gen_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ cmpl $0, %r14d
+ jg 0f // offB>0
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+0:
+ cmpl $1, %r14d
+ jg 1f // offB>1
+
+ // offB==1
+
+ addq $8, %r12 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+1:
+ cmpl $2, %r14d
+ jg 2f // offB>2
+
+ // offB==2
+
+ addq $16, %r12 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+	subl	$1, %r10d // k-1
+	addq	$32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r12 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq $8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+	subl	$1, %r10d // k-1
+	addq	$32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_4x4_gen_lib4, .-inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10 <- A
+// r11 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- B+4*4*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
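+//
+// note: dlauum edge on the diagonal block: column l of A is masked with
+// vblendpd against a zero register so that only its first l+1 entries
+// contribute (A is used as upper triangular), while the matching panel
+// column of B is broadcast entry by entry as in a plain nt product.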
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dlauum_nt_4x4_lib4, @function
+inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ vmovapd 32(%r10), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 32(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 40(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ vmovapd 64(%r10), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 64(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 72(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 80(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 96(%r10), %ymm8
+ vbroadcastsd 96(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 104(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 112(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 120(%r11), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ addq $128, %r10
+ addq $128, %r11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dlauum_nt_4x4_lib4, .-inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for dlauum
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dlauum_nt_4x4_vs_lib4, @function
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dlauum_nt_4x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %ymm8
+ subl $1, %r10d
+ vbroadcastsd 0(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vbroadcastsd 8(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vbroadcastsd 16(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vbroadcastsd 24(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ addq $32, %r11
+ addq $32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dlauum_nt_4x4_vs_lib4, .-inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d02 d13 d20 d31]
+// ymm3 <- [d03 d12 d21 d30]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
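+//
+// note: the 4x4 nt kernel leaves every column of D rotated across ymm0..ymm3
+// (see the input layout above); the first group of vblendpd below re-pairs
+// the alternating quadwords, the second group merges the low/high 128-bit
+// halves, so ymm0..ymm3 end up holding the plain columns of D; no arithmetic
+// is performed.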
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_4x4_lib4, @function
+inner_blend_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_4x4_lib4:
+#endif
+#endif
+
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
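+//
+// note: scales the 4x4 accumulator (already in column order) by alpha and,
+// unless beta compares equal to 0.0, adds beta*C with vfmadd231pd; when
+// beta==0.0 the loads from C are skipped entirely, so C is not dereferenced.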
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_lib4, @function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
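+//
+// note: same alpha/beta scaling as the routine above, but C may start at a
+// row offset (r12) inside its panel: for offset>0 each column of C is
+// gathered from two consecutive panels (C0 and C1 = C0 + 4*sdc*sizeof(double))
+// with vblendpd and rotated back into place with vperm2f128 (plus vshufpd for
+// odd offsets) before the fused multiply-add.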
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_gen_lib4, @function
+inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vmovapd 32(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 96(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+
+ jmp 3f
+
+0:
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm3
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm3
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_gen_lib4, .-inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_4x4_lib4, @function
+inner_scale_a0_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_4x4_lib4, .-inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_4x4_lib4, @function
+inner_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_4x4_lib4, .-inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d02 d13 d20 d31]
+// ymm3 <- [d03 d12 d21 d30]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
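+//
+// note: performs the blend of INNER_BLEND_4X4_LIB4 and the alpha/beta scaling
+// of INNER_SCALE_AB_4X4_LIB4 in one pass; as there, C is only read when beta
+// is non-zero.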
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d02 d13 d20 d31]
+// ymm3 <- [d03 d12 d21 d30]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_gen_lib4, @function
+inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_gen_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vmovapd 32(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vmovapd 96(%r13), %ymm12
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+
+ jmp 3f
+
+0:
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm3
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm3
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r13), %ymm13
+ vmovapd 32(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 96(%r13), %ymm13
+ vmovapd 96(%r15), %ymm14
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm3
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_gen_lib4, .-inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d02 d13 d20 d31]
+// ymm3 <- [d03 d12 d21 d30]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
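+//
+// note: special case alpha = beta = 1.0: after undoing the register rotation
+// the columns of C are simply added to the accumulator with vaddpd, with no
+// run-time branch on alpha or beta.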
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_4x4_lib4, @function
+inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_4x4_lib4:
+#endif
+#endif
+
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
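+//
+// note: in-register Cholesky factorization of the 4x4 block in ymm0..ymm3:
+// for each column j the pivot d_jj is tested against 0.0, its inverse square
+// root is computed with vsqrtsd/vdivsd and stored to inv_diag_E[j], the
+// column is scaled and the trailing columns are updated with vfnmadd231pd; a
+// non-positive pivot makes the scaling factor 0.0 (zeroing the column), and
+// kn (r11d) allows an early exit after 1, 2 or 3 factorized columns.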
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_4x4_vs_lib4, @function
+inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ cmpl $2, %r11d
+ jl 0f // ret
+// vperm2f128 $0x00, %ymm0, %ymm0, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ cmpl $4, %r11d
+ jl 0f // ret
+// vperm2f128 $0x11, %ymm2, %ymm2, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+
+// vextractf128 $0x1, %ymm3, %xmm13
+// vpermilpd $0x3, %xmm13, %xmm13
+ vpermpd $0xff, %ymm3, %ymm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
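+//
+// note: column j of the right-hand side is scaled by inv_diag_E[j] (the
+// reciprocal of E_jj is pre-computed, so no division happens here) and then
+// eliminated from the following columns using the sub-diagonal entries of E
+// broadcast from the panel.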
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ cmpl $2, %r12d
+ vmulpd %ymm0, %ymm13, %ymm0
+
+ jl 0f // ret
+
+ vbroadcastsd 8(%r10), %ymm13
+ cmpl $3, %r12d
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+
+ jl 0f // ret
+
+ vbroadcastsd 16(%r10), %ymm13
+ cmpl $4, %r12d
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+
+ jl 0f // ret
+
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_4x4_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_4x4_lib4, .-inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+
+ jl 0f // ret
+
+ vbroadcastsd 8(%r10), %ymm13
+ cmpl $3, %r11d
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+
+ jl 0f // ret
+
+ vbroadcastsd 16(%r10), %ymm13
+ cmpl $4, %r11d
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+
+ jl 0f // ret
+
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
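+//
+// note: the columns are processed in reverse order (3 down to 0); each one is
+// scaled by its reciprocal diagonal entry from inv_diag_E and then eliminated
+// from the lower-numbered columns using the strictly upper entries of E.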
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_4x4_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm2
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm1
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm0
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm1
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm0
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm0
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_4x4_lib4, .-inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm2
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm1
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm0
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm1
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm0
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm0
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
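+//
+// note: columns are processed in increasing order; column j first subtracts
+// the contributions of the already-solved columns 0..j-1, weighted by
+// E(0..j-1,j) broadcast from the panel, and is then scaled by inv_diag_E[j].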
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_run_inv_4x4_lib4, @function
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_run_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_4x4_lib4:
+#endif
+#endif
+
+ // first column
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+ // second column
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm1
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+
+ // third column
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm2
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm2
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+
+ // fourth column
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm3
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm3
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm3
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_run_inv_4x4_lib4, .-inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
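+//
+// note: for l = 0..2 the panel column l of E is masked with vblendpd to keep
+// only its strictly lower part, entry l of every right-hand-side column is
+// broadcast with vpermpd, and the remaining rows are updated with
+// vfnmadd231pd; the unit diagonal means no scaling is required.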
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lln_one_4x4_lib4, @function
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lln_one_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_4x4_lib4:
+#endif
+#endif
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r10), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vpermpd $0x00, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+
+ vmovapd 32(%r10), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vpermpd $0x55, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+
+ vmovapd 64(%r10), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vpermpd $0xaa, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vpermpd $0xaa, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lln_one_4x4_lib4, .-inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
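+//
+// note: rows are solved from 3 down to 0: the solved entry is scaled by its
+// reciprocal diagonal (broadcast from inv_diag_E), blended back into the
+// right-hand side, and the strictly upper part of the corresponding column
+// of E is subtracted from the rows above it.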
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_4x4_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r11), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vbroadcastsd 0(%r11), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_4x4_lib4, .-inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
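+// Added note: same substitution as inner_edge_dtrsm_lun_inv_4x4_lib4 above,
+// except that the steps for rows with index >= km (r12d) are skipped, so only
+// the valid part of a partial border block is updated.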
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0xf, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r11), %ymm12
+
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm14
+ vpermilpd $0x0, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r11), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_4x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
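+// Added note (scalar reference sketch; the code below applies the updates one
+// column at a time but computes the same factorization): unblocked LU without
+// pivoting of the 4x4 block held one column per register in ymm0..ymm3,
+// storing the reciprocals of the U diagonal in inv_diag_E:
+//
+//   for(j=0; j<4; j++) {
+//     tmp = 1.0 / A[j+4*j];
+//     inv_diag_E[j] = tmp;
+//     for(i=j+1; i<4; i++)
+//       A[i+4*j] *= tmp;                    // L factor below the diagonal
+//     for(k=j+1; k<4; k++)
+//       for(i=j+1; i<4; i++)
+//         A[i+4*k] -= A[i+4*j] * A[j+4*k];  // update the trailing columns
+//   }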
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_4x4_lib4, @function
+inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_4x4_lib4:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+// vmovddup %xmm14, %xmm14
+
+ // first column
+// vblendpd $0x1, %ymm0, %ymm12, %ymm12
+ vmovapd %ymm0, %ymm12
+ vdivsd %xmm0, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r10)
+ vmulpd %ymm0, %ymm13, %ymm0
+ vblendpd $0x1, %ymm12, %ymm0, %ymm0
+
+ // second column
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vblendpd $0x2, %ymm1, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r10)
+ vmulpd %ymm1, %ymm13, %ymm1
+ vblendpd $0x3, %ymm12, %ymm1, %ymm1
+
+ // third column
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vblendpd $0x2, %ymm2, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vblendpd $0x4, %ymm2, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm2, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r10)
+ vmulpd %ymm2, %ymm13, %ymm2
+ vblendpd $0x7, %ymm12, %ymm2, %ymm2
+
+ // fourth column
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vblendpd $0x2, %ymm3, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vblendpd $0x4, %ymm3, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vblendpd $0x8, %ymm3, %ymm12, %ymm12
+
+ vpermpd $0xff, %ymm3, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r10)
+// vmulpd %ymm3, %ymm13, %ymm3
+ vblendpd $0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_4x4_lib4, .-inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
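+// Added note: the store mask is built arithmetically. km (r11d) is converted
+// to double and broadcast, then subtracted from the per-lane constants in
+// .LC02 (defined elsewhere in this file); the result is negative exactly in
+// the lanes with row index < km, and vmaskmovpd uses each lane's sign bit as
+// its store mask. Columns with index >= kn are skipped by the compares on
+// r12d, so only the leading km x kn part of the block is written.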
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_vs_lib4, @function
+inner_store_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
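+// Added note: only the lower triangle (diagonal included) of the 4x4 block is
+// written. Each column j > 0 is first reloaded from D and blended so that the
+// rows above the diagonal keep their previous values, roughly:
+//
+//   for(jj=0; jj<4; jj++)
+//     for(ii=jj; ii<4; ii++)
+//       D[ii+4*jj] = d[ii][jj];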
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_lib4, @function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs lower triangular
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_vs_lib4, @function
+inner_store_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
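+// Added note: "generalized" store. Rows m0..m1-1 and columns n0..n1-1 of the
+// 4x4 block are written to a destination that may start at a non-zero row
+// offset (r10) inside its 4-row panel. The row mask combines two arithmetic
+// masks (lane index >= m0 and lane index < m1, built from .LC02 as in the vs
+// store). For offset > 0 the registers and the mask are rotated by offset
+// lanes and each column is stored twice: the low part into the current panel
+// (r11) and the wrapped part into the next panel (r11 + 4*sdd*sizeof(double)),
+// with the .LC05-.LC10 constants splitting the mask between the two halves.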
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_gen_lib4, @function
+inner_store_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm12, %ymm15
+ vandpd %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm2, %ymm15, 64(%r11)
+ je 4f // end
+ vmaskmovpd %ymm3, %ymm15, 96(%r11)
+
+ jmp 4f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm12, %ymm0, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm12, %ymm1, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm12, %ymm2, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm12, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+3:
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 4f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+4:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_gen_lib4, .-inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_gen_lib4, @function
+inner_store_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm12, %ymm15
+ vandpd %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm2, %ymm15, 64(%r11)
+ je 3f // end
+ vblendpd $0x4, %ymm14, %ymm15, %ymm15
+ vmaskmovpd %ymm3, %ymm15, 96(%r11)
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vperm2f128 $0x01, %ymm0, %ymm0, %ymm12
+ vshufpd $0x5, %ymm12, %ymm0, %ymm0
+
+ vperm2f128 $0x01, %ymm1, %ymm1, %ymm12
+ vshufpd $0x5, %ymm12, %ymm1, %ymm1
+
+ vperm2f128 $0x01, %ymm2, %ymm2, %ymm12
+ vshufpd $0x5, %ymm12, %ymm2, %ymm2
+
+ vperm2f128 $0x01, %ymm3, %ymm3, %ymm12
+ vshufpd $0x5, %ymm12, %ymm3, %ymm3
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm15, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm14
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm14, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x2, %ymm14, %ymm13, %ymm13
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 1)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_gen_lib4, .-inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
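+// Added note (illustrative scalar reference, not the actual implementation):
+// computes D = alpha*A*B' + beta*C on one 4x4 block, with all matrices in
+// panel-major storage (4-row panels, column-major inside a panel):
+//
+//   for(ii=0; ii<4; ii++)
+//     for(jj=0; jj<4; jj++) {
+//       c = 0.0;
+//       for(kk=0; kk<k; kk++)
+//         c += A[ii+4*kk] * B[jj+4*kk];
+//       D[ii+4*jj] = alpha[0]*c + beta[0]*C[ii+4*jj];
+//     }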
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .type kernel_dgemm_nt_4x4_lib4, @function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
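+// Added note: same computation as kernel_dgemm_nt_4x4_lib4, but only the
+// leading km x kn part of D is stored (inner_store_4x4_vs_lib4), for partial
+// blocks at the right/bottom border of the matrix.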
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .type kernel_dgemm_nt_4x4_vs_lib4, @function
+kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_vs_lib4
+_kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_dgemm_nt_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_gen_lib4
+ .type kernel_dgemm_nt_4x4_gen_lib4, @function
+kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_gen_lib4
+_kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_gen_lib4
+ .def kernel_dgemm_nt_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d			// 4*sdd*sizeof(double)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_gen_lib4, .-kernel_dgemm_nt_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
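+// Added note: "nn" variant, D = alpha*A*B + beta*C. B is traversed row-wise
+// through its panel-major storage: the edge routine consumes the initial rows
+// when offsetB != 0, so that the main loop can then step through B one
+// aligned panel at a time with stride 4*sdb*sizeof(double).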
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x4_lib4
+ .type kernel_dgemm_nn_4x4_lib4, @function
+kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x4_lib4
+_kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x4_lib4
+ .def kernel_dgemm_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_dgemm_nn_4x4_gen_lib4(int k, double *alpha, double *A, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x4_gen_lib4
+ .type kernel_dgemm_nn_4x4_gen_lib4, @function
+kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x4_gen_lib4
+_kernel_dgemm_nn_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x4_gen_lib4
+ .def kernel_dgemm_nn_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+	sall	$5, %r12d			// 4*sdd*sizeof(double)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_gen_lib4, .-kernel_dgemm_nn_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
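+// Added note: rank-k update D = alpha*A*B' + beta*C where only the lower
+// triangle of the 4x4 block is stored (inner_store_l_4x4_lib4); the strictly
+// upper part of D is left untouched. In a dsyrk the B argument points into
+// the same matrix as A, so the block is symmetric and the lower triangle is
+// all that needs to be formed.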
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .type kernel_dsyrk_nt_l_4x4_lib4, @function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_vs_lib4
+_kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_dsyrk_nt_l_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_gen_lib4
+ .type kernel_dsyrk_nt_l_4x4_gen_lib4, @function
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_gen_lib4
+_kernel_dsyrk_nt_l_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_gen_lib4
+ .def kernel_dsyrk_nt_l_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d			// 4*sdd*sizeof(double)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_gen_lib4, .-kernel_dsyrk_nt_l_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
+
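+// Added note: triangular matrix-matrix multiply D = alpha*A*B with B lower
+// triangular on the right ("rl"); there is no C/beta term. The edge routine
+// accumulates the contribution of the leading triangular block of B, the
+// regular nn kernel adds the remaining full rows, and inner_scale_a0 applies
+// alpha before the store.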
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_4x4_lib4
+ .type kernel_dtrmm_nn_rl_4x4_lib4, @function
+kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_4x4_lib4
+_kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_4x4_lib4
+ .def kernel_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_4x4_lib4, .-kernel_dtrmm_nn_rl_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrmm_nn_rl_4x4_gen_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+ .type kernel_dtrmm_nn_rl_4x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_4x4_gen_lib4
+_kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_4x4_gen_lib4
+ .def kernel_dtrmm_nn_rl_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+	movq	ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // offsetD
+ movq ARG8, %r11 // D
+ movq ARG9, %r12 // sdd
+	sall	$5, %r12d			// 4*sdd*sizeof(double)
+ movq ARG10, %r13 // m0
+ movq ARG11, %r14 // m1
+ movq ARG12, %r15 // n0
+ movq ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_4x4_gen_lib4, .-kernel_dtrmm_nn_rl_4x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
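+// Added note: D = alpha*A*B' + beta*C with B upper triangular ("ru", B
+// transposed). The main nt loop below runs over columns 4..k-1 (A and B are
+// advanced by 4*bs and k is reduced by 4); the triangular contribution of the
+// first four columns is added afterwards by the edge routine, before scaling
+// and storing.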
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .type kernel_dtrmm_nt_ru_4x4_lib4, @function
+kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_lib4
+_kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10
+ movq ARG4, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
+_kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
+
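+// Added note: factorizes one 4x4 diagonal block: the nt-sub kernel computes
+// the Schur update -A*B', inner_scale_11 adds C, the dpotrf edge routine
+// computes the lower Cholesky factor in registers and writes the reciprocals
+// of its diagonal to inv_diag_D, and the result is stored as a lower triangle
+// into D.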
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .type kernel_dpotrf_nt_l_4x4_lib4, @function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // km
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+
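+// Added note: fused syrk + potrf on one 4x4 diagonal block: it accumulates
+// +Ap*Bp' over kp columns and -Am*Bm' over km columns, adds C
+// (inner_scale_11), then factorizes as in kernel_dpotrf_nt_l_4x4_lib4,
+// storing the lower factor into D and the inverted diagonal into inv_diag_D.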
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
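+//
+// tail (variable-size) version of the kernel above: the trailing km/kn arguments give
+// the number of rows and columns actually computed and stored (see the _vs edge and
+// store routines called below)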
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
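+//
+// gemm + right triangular solve: roughly D = ( C - A * B^T ) * E^{-T}, with E lower
+// triangular and its reciprocal diagonal passed in inv_diag_E (as composed from the
+// sub dgemm, scale_11 and dtrsm_rlt_inv edge routines called below)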
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
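+//
+// fused gemm + right triangular solve: roughly D = ( C + Ap * Bp^T - Am * Bm^T ) * E^{-T},
+// with E lower triangular and its reciprocal diagonal passed in inv_diag_E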
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9
+// void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E);
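+//
+// as the rl_inv kernel above, but E has unit diagonal, so no inv_diag_E is needed:
+// roughly D = ( C - A * B^T ) * E^{-T}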
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_4x4_lib4
+ .type kernel_dtrsm_nt_rl_one_4x4_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_4x4_lib4
+_kernel_dtrsm_nt_rl_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_4x4_lib4
+ .def kernel_dtrsm_nt_rl_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_4x4_lib4, .-kernel_dtrsm_nt_rl_one_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_one_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // km
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
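+//
+// right solve against an upper triangular E (rut edge routine below):
+// roughly D = ( C - A * B^T ) * E^{-T}, reciprocal diagonal in inv_diag_E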
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+ .type kernel_dtrsm_nt_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_4x4_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_lib4
+ .def kernel_dtrsm_nt_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_4x4_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nt_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nn_ru_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
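+//
+// nn variant: B is panel-major with panel stride sdb, converted to bytes below via
+// sall $5 (i.e. 4*sdb*sizeof(double)); roughly D = ( C - A * B ) * E^{-1}, E upper
+// triangular with reciprocal diagonal in inv_diag_E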
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+ .type kernel_dtrsm_nn_ru_inv_4x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_4x4_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_lib4
+ .def kernel_dtrsm_nn_ru_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_4x4_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nn_ru_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_dtrsm_nn_ll_one_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E);
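+//
+// left solve with unit lower triangular E: roughly D = E^{-1} * ( C - A * B )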
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_4x4_lib4
+ .type kernel_dtrsm_nn_ll_one_4x4_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_4x4_lib4
+_kernel_dtrsm_nn_ll_one_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_4x4_lib4
+ .def kernel_dtrsm_nn_ll_one_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_4x4_lib4, .-kernel_dtrsm_nn_ll_one_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+ .type kernel_dtrsm_nn_ll_one_4x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+ .def kernel_dtrsm_nn_ll_one_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_4x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16
+// void kernel_dtrsm_nn_lu_inv_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E);
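+//
+// left solve with upper triangular E: roughly D = E^{-1} * ( C - A * B ),
+// reciprocal diagonal in inv_diag_E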
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+ .type kernel_dtrsm_nn_lu_inv_4x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_4x4_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_lib4
+ .def kernel_dtrsm_nn_lu_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_4x4_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nn_lu_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // inv_diag_E
+ movq ARG9, %r12 // km
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_4X4_VS_LIB4 // TODO
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_4x4_vs_lib4 // TODO
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_4x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dgetrf_nn_4x4_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D);
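+//
+// LU factorization (no pivoting) of the 4x4 block C - A * B: D holds the packed
+// factors and inv_diag_D the reciprocals of the diagonal (as composed from the
+// sub dgemm, scale_11 and dgetrf edge routines called below)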
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_4x4_lib4
+ .type kernel_dgetrf_nn_4x4_lib4, @function
+kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_4x4_lib4
+_kernel_dgetrf_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_4x4_lib4
+ .def kernel_dgetrf_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG7, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_4x4_lib4, .-kernel_dgetrf_nn_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgetrf_nn_4x4_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_4x4_vs_lib4
+ .type kernel_dgetrf_nn_4x4_vs_lib4, @function
+kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_4x4_vs_lib4
+_kernel_dgetrf_nn_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_4x4_vs_lib4
+ .def kernel_dgetrf_nn_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // B
+ movq ARG4, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG7, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG6, %r10 // D
+
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_4x4_vs_lib4, .-kernel_dgetrf_nn_4x4_vs_lib4
+#endif
+
+
+
+
+
+#if 0
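+// NOTE: the two dlauum kernels below are compiled out by this #if 0 block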
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dlauum_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlauum_nt_4x4_lib4
+ .type kernel_dlauum_nt_4x4_lib4, @function
+kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlauum_nt_4x4_lib4
+_kernel_dlauum_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlauum_nt_4x4_lib4
+ .def kernel_dlauum_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DLAUUM_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dlauum_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dlauum_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_dlauum_nt_4x4_lib4, .-kernel_dlauum_nt_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dlauum_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlauum_nt_4x4_vs_lib4
+ .type kernel_dlauum_nt_4x4_vs_lib4, @function
+kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlauum_nt_4x4_vs_lib4
+_kernel_dlauum_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlauum_nt_4x4_vs_lib4
+ .def kernel_dlauum_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dlauum_nt_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DLAUUM_NT_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dlauum_nt_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlauum_nt_4x4_vs_lib4, .-kernel_dlauum_nt_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4
+// void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD);
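+//
+// apply a block of 4 Householder reflectors from the right (dlarfb-style):
+// roughly D := D + (D * V^T) * T * V, with the leading 4x4 block of V treated as
+// unit upper triangular and any sign convention folded into T (sketch inferred
+// from the explicit triangle code and the inner dgemm/dgebp calls below)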
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb4_r_4_lib4
+ .type kernel_dlarfb4_r_4_lib4, @function
+kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb4_r_4_lib4
+_kernel_dlarfb4_r_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb4_r_4_lib4
+ .def kernel_dlarfb4_r_4_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+// vxorpd %ymm0, %ymm0, %ymm0
+// vmovapd %ymm0, %ymm1
+// vmovapd %ymm0, %ymm2
+// vmovapd %ymm0, %ymm3
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG2, %r12 // V
+
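+	// W = D * V^T for the leading 4x4 block of V (unit upper triangle implicit);
+	// the remaining k-4 columns are accumulated by the dgemm nt kernel below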
+ //
+ vmovapd 0(%r11), %ymm0
+ //
+ vmovapd 32(%r11), %ymm1
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm13, %ymm1, %ymm0
+ //
+ vmovapd 64(%r11), %ymm2
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm13, %ymm2, %ymm0
+ vbroadcastsd 72(%r12), %ymm13
+ vfmadd231pd %ymm13, %ymm2, %ymm1
+ //
+ vmovapd 96(%r11), %ymm3
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm0
+ vbroadcastsd 104(%r12), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm1
+ vbroadcastsd 112(%r12), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm2
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+ movq ARG3, %r10 // T
+
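+	// W := W * T, with T the 4x4 upper triangular block reflector factor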
+ //
+ vbroadcastsd 120(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ //
+ vbroadcastsd 112(%r10), %ymm12
+ vfmadd231pd %ymm2, %ymm12, %ymm3
+ vbroadcastsd 80(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ //
+ vbroadcastsd 104(%r10), %ymm12
+ vfmadd231pd %ymm1, %ymm12, %ymm3
+ vbroadcastsd 72(%r10), %ymm12
+ vfmadd231pd %ymm1, %ymm12, %ymm2
+ vbroadcastsd 40(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ //
+ vbroadcastsd 96(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm3
+ vbroadcastsd 64(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm2
+ vbroadcastsd 32(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm1
+ vbroadcastsd 0(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // V
+ movq ARG4, %r12 // D
+
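+	// D(:,0:4) += W * V for the leading 4x4 block of V (unit upper triangle implicit);
+	// the remaining columns are updated by the dgebp nn kernel below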
+ //
+ vmovapd 0(%r12), %ymm12
+ vaddpd %ymm12, %ymm0, %ymm12
+ vmovapd %ymm12, 0(%r12)
+ //
+ vmovapd 32(%r12), %ymm12
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vaddpd %ymm12, %ymm1, %ymm12
+ vmovapd %ymm12, 32(%r12)
+ //
+ vmovapd 64(%r12), %ymm12
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vaddpd %ymm12, %ymm2, %ymm12
+ vmovapd %ymm12, 64(%r12)
+ //
+ vmovapd 96(%r12), %ymm12
+ vbroadcastsd 96(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vbroadcastsd 104(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vbroadcastsd 112(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vaddpd %ymm12, %ymm3, %ymm12
+ vmovapd %ymm12, 96(%r12)
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEBP_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgebp_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgebp_add_nn_4x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb4_r_4_lib4, .-kernel_dlarfb4_r_4_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { -1 -1 -1 1 }
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { -1 -1 -1 -1 }
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 3.5 2.5 1.5 0.5 }
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { 7.5 6.5 5.5 4.5 }
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC04: // { 1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgemm_8x4_lib4.S b/kernel/avx2/kernel_dgemm_8x4_lib4.S
new file mode 100644
index 0000000..82a5a86
--- /dev/null
+++ b/kernel/avx2/kernel_dgemm_8x4_lib4.S
@@ -0,0 +1,12995 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
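+
+// The ARGn macros above follow the standard calling conventions: on Linux/Mac the
+// first 6 integer arguments arrive in rdi, rsi, rdx, rcx, r8, r9 and the rest on the
+// stack; on Windows the first 4 arrive in rcx, rdx, r8, r9 and stack arguments start
+// past the 32-byte shadow space (hence the +40 offset for ARG5). The Windows prologue
+// additionally saves rdi, rsi and xmm6-xmm15, which are callee-saved in that ABI.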
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
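+//
+// (the nt kernel below streams two 4-row panels of A, the second at offset
+// r12 = 4*sda*sizeof(double), broadcasts the elements of B one by one and
+// accumulates the 8x4 result with FMAs, 4 columns of B unrolled per iteration)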
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_8x4_lib4, @function
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_8x4_lib4:
+#endif
+#endif
+
+// broadcast scheme
+#if 1
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ addq $128, %r13
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ addq $128, %r13
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ addq $32, %r11
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ subl $1, %r10d
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+// shuffle scheme
+#else
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vmovapd 0(%r13), %ymm12 // B[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vfmadd231pd %ymm9, %ymm14, %ymm5
+
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $4, %r10d
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ vfmadd231pd %ymm9, %ymm14, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vfmadd231pd %ymm10, %ymm13, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vfmadd231pd %ymm11, %ymm14, %ymm5
+
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vfmadd231pd %ymm10, %ymm13, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm7
+
+ vfmadd231pd %ymm10, %ymm14, %ymm2
+ vfmadd231pd %ymm11, %ymm14, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vfmadd231pd %ymm9, %ymm14, %ymm5
+
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ vfmadd231pd %ymm9, %ymm14, %ymm6
+
+
+ // unroll 3
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vfmadd231pd %ymm10, %ymm13, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm1
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vfmadd231pd %ymm11, %ymm14, %ymm5
+
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vfmadd231pd %ymm10, %ymm13, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm7
+
+ vfmadd231pd %ymm10, %ymm14, %ymm2
+ vfmadd231pd %ymm11, %ymm14, %ymm6
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vmovapd 32(%r13), %ymm13 // B[4]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 32(%r11), %ymm10 // A0[4]
+ vfmadd231pd %ymm9, %ymm14, %ymm5
+
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1[4]
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $4, %r10d
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ vfmadd231pd %ymm9, %ymm14, %ymm6
+
+ // unroll 1
+ vmovapd 64(%r13), %ymm12 // B[8]
+ vfmadd231pd %ymm10, %ymm13, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm1
+ vmovapd 64(%r11), %ymm8 // A0[8]
+ vfmadd231pd %ymm11, %ymm14, %ymm5
+
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1[8]
+ vfmadd231pd %ymm10, %ymm13, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm7
+
+ vfmadd231pd %ymm10, %ymm14, %ymm2
+ vfmadd231pd %ymm11, %ymm14, %ymm6
+
+ // unroll 2
+ vmovapd 96(%r13), %ymm13 // B[12]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm12
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ vmovapd 96(%r11), %ymm10 // A0[12]
+ vfmadd231pd %ymm9, %ymm14, %ymm5
+
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1[12]
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ addq $128, %r13
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ vfmadd231pd %ymm9, %ymm14, %ymm6
+
+
+ // unroll 3
+// vmovapd 0(%r13), %ymm12 // B[0]
+ vfmadd231pd %ymm10, %ymm13, %ymm0
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm4
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm13
+ vfmadd231pd %ymm10, %ymm14, %ymm1
+// vmovapd 0(%r11), %ymm8 // A0[0]
+ vfmadd231pd %ymm11, %ymm14, %ymm5
+// cmpl $3, %r10d
+
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vfmadd231pd %ymm10, %ymm13, %ymm3
+ vshufpd $0x5, %ymm13, %ymm13, %ymm14
+ vfmadd231pd %ymm11, %ymm13, %ymm7
+
+ vfmadd231pd %ymm10, %ymm14, %ymm2
+ vfmadd231pd %ymm11, %ymm14, %ymm6
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r13), %ymm12 // B[0]
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vshufpd $0x5, %ymm12, %ymm12, %ymm14
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ addq $32, %r11
+
+ vfmadd231pd %ymm8, %ymm14, %ymm1
+ addq $32, %r13
+ vfmadd231pd %ymm9, %ymm14, %ymm5
+
+ vperm2f128 $0x1, %ymm14, %ymm14, %ymm14
+ vfmadd231pd %ymm8, %ymm14, %ymm3
+ vfmadd231pd %ymm9, %ymm14, %ymm7
+
+ vshufpd $0x5, %ymm14, %ymm14, %ymm14
+ vfmadd231pd %ymm8, %ymm14, %ymm2
+ subl $1, %r10d
+ vfmadd231pd %ymm9, %ymm14, %ymm6
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_8x4_lib4, .-inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_8x4_lib4, @function
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ addq $128, %r13
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ addq $128, %r13
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ addq $32, %r11
+
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+ subl $1, %r10d
+
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_8x4_lib4, .-inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// rbx <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
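+// Reference only (not assembled): a minimal C sketch of the update, assuming the
+// lib4 panel-major layout for A (two 4-row panels A0/A1) and for B (4-row panels
+// with panel stride sdb). Names are illustrative, not part of the BLASFEO API.
+//
+//	// D[0:8][0:4] += A[0:8][0:k] * B[0:k][0:4]
+//	static void ref_dgemm_add_nn_8x4_lib4(int kmax, const double *A0,
+//		const double *A1, const double *B, int sdb,
+//		double D0[4][4], double D1[4][4])
+//		{
+//		int ii, jj, kk;
+//		for(kk=0; kk<kmax; kk++)
+//			{
+//			// element (kk,jj) of B sits at offset 4*jj + kk%4 in panel kk/4
+//			const double *b = B + (kk/4)*4*sdb + kk%4;
+//			for(jj=0; jj<4; jj++)
+//				for(ii=0; ii<4; ii++)
+//					{
+//					D0[jj][ii] += A0[4*kk+ii] * b[4*jj];
+//					D1[jj][ii] += A1[4*kk+ii] * b[4*jj];
+//					}
+//			}
+//		}
+//
+// This is why the main loop below strides B by 8 bytes per k-iteration and by
+// r14 = 4*sdb*sizeof(double) once every four iterations.
+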
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_8x4_lib4, @function
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 2) // software prefetch
+ prefetcht0 64(%r13, %r14, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ addq %r14, %r13
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm0
+ vfmadd231pd %ymm11, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm1
+ vfmadd231pd %ymm11, %ymm12, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm2
+ vfmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm10, %ymm12, %ymm3
+ addq %r14, %r13
+ vfmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ addq $32, %r11
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ subl $1, %r10d
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $8, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_8x4_lib4, .-inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nn_8x4_lib4, @function
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 2) // software prefetch
+ prefetcht0 64(%r13, %r14, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ addq %r14, %r13
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A0
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 1
+ vbroadcastsd 8(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm8 // A0
+
+ vbroadcastsd 40(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 72(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 104(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastsd 16(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A0
+
+ vbroadcastsd 48(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A1
+
+ vbroadcastsd 80(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 112(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 3
+ vbroadcastsd 24(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm0
+ vfnmadd231pd %ymm11, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm8 // A0
+
+ vbroadcastsd 56(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm1
+ vfnmadd231pd %ymm11, %ymm12, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm9 // A1
+
+ vbroadcastsd 88(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm2
+ vfnmadd231pd %ymm11, %ymm12, %ymm6
+
+ vbroadcastsd 120(%r13), %ymm12
+ vfnmadd231pd %ymm10, %ymm12, %ymm3
+ addq %r14, %r13
+ vfnmadd231pd %ymm11, %ymm12, %ymm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ vmovapd 0(%r11), %ymm8 // A0[0]
+ vmovapd 0(%r11, %r12, 1), %ymm9 // A1[0]
+ vbroadcastsd 0(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm0
+ vfnmadd231pd %ymm9, %ymm12, %ymm4
+
+ vbroadcastsd 32(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm1
+ vfnmadd231pd %ymm9, %ymm12, %ymm5
+ addq $32, %r11
+
+ vbroadcastsd 64(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm2
+ vfnmadd231pd %ymm9, %ymm12, %ymm6
+ subl $1, %r10d
+
+ vbroadcastsd 96(%r13), %ymm12
+ vfnmadd231pd %ymm8, %ymm12, %ymm3
+ vfnmadd231pd %ymm9, %ymm12, %ymm7
+ addq $8, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nn_8x4_lib4, .-inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
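+// Reference only (not assembled): same addressing as the 8x4 nn kernel above,
+// but with a single 4-row panel of A and 8 columns of B; in illustrative C
+// (D, A, B, sdb, kmax are not BLASFEO API names):
+//
+//	// D[0:4][0:8] += A[0:4][0:k] * B[0:k][0:8]
+//	for(kk=0; kk<kmax; kk++)
+//		for(jj=0; jj<8; jj++)
+//			for(ii=0; ii<4; ii++)
+//				D[jj][ii] += A[4*kk+ii] * B[(kk/4)*4*sdb + kk%4 + 4*jj];
+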
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x8_lib4, @function
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+ prefetcht0 128(%r12, %r13, 2) // software prefetch
+ prefetcht0 192(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 136(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 168(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 200(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 232(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 144(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 176(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 208(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 240(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 152(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 184(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 216(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 248(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq %r13, %r12
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd 32(%r11), %ymm14 // A
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastsd 8(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastsd 40(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 72(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 104(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 136(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 168(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 200(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 232(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastsd 16(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vmovapd -32(%r11), %ymm14 // A
+ vbroadcastsd 48(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 80(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 112(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 144(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 176(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 208(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 240(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+
+ // unroll 0
+ vbroadcastsd 24(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 56(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vbroadcastsd 88(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vbroadcastsd 120(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vbroadcastsd 152(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vbroadcastsd 184(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vbroadcastsd 216(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm6
+ vbroadcastsd 248(%r12), %ymm12 // B
+ vfmadd231pd %ymm14, %ymm12, %ymm7
+ addq %r13, %r12
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastsd 0(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm0
+ vbroadcastsd 32(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm1
+ vbroadcastsd 64(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm2
+ vbroadcastsd 96(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm3
+ vbroadcastsd 128(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm4
+ vbroadcastsd 160(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm5
+ vbroadcastsd 192(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm6
+ vbroadcastsd 224(%r12), %ymm12 // B
+ vfmadd231pd %ymm13, %ymm12, %ymm7
+
+ addq $32, %r11
+ addq $8, %r12
+ subl $1, %r10d
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x8_lib4, .-inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- B
+// r12 <- C
+// r13 <- 32*sdc
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+// ymm4 <- [a40 a50 a60 a70]
+// ymm5 <- [a41 a51 a61 a71]
+// ymm6 <- [a42 a52 a62 a72]
+// ymm7 <- [a43 a53 a63 a73]
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- B+4*k*sizeof(double)
+// r12 <- C+4*k*sizeof(double)
+// r13 <- 32*sdc
+// ymm0 <- [a00 a10 a20 a30]
+// ymm1 <- [a01 a11 a21 a31]
+// ymm2 <- [a02 a12 a22 a32]
+// ymm3 <- [a03 a13 a23 a33]
+// ymm4 <- [a40 a50 a60 a70]
+// ymm5 <- [a41 a51 a61 a71]
+// ymm6 <- [a42 a52 a62 a72]
+// ymm7 <- [a43 a53 a63 a73]
+
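+// Reference only (not assembled): a minimal C sketch, assuming the 8x4 block of
+// A is already held in ymm0-ymm7 and that B and C use the lib4 panel-major
+// layout; the routine performs the rank-4 update C[0:8][0:k] += A[0:8][0:4]*B,
+// one column of C per iteration. Names are illustrative, not BLASFEO API.
+//
+//	static void ref_dgebp_add_nn_8x4_lib4(int kmax, const double A0[4][4],
+//		const double A1[4][4], const double *B, double *C0, double *C1)
+//		{
+//		int ii, jj, ll;
+//		for(jj=0; jj<kmax; jj++)
+//			for(ll=0; ll<4; ll++)
+//				for(ii=0; ii<4; ii++)
+//					{
+//					C0[4*jj+ii] += A0[ll][ii] * B[4*jj+ll];
+//					C1[4*jj+ii] += A1[ll][ii] * B[4*jj+ll];
+//					}
+//		}
+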
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgebp_add_nn_8x4_lib4, @function
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgebp_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgebp_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgebp_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ cmpl $3, %r10d
+ jle 2f // cleanup loop
+
+ // main loop
+ .p2align 3
+1:
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vbroadcastsd 0(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vbroadcastsd 8(%r11), %ymm13
+ subl $4, %r10d
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vbroadcastsd 16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vbroadcastsd 24(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+
+ vmovapd 32(%r12), %ymm12
+ vmovapd 32(%r12, %r13, 1), %ymm14
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vbroadcastsd 40(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vbroadcastsd 48(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vbroadcastsd 56(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vmovapd %ymm12, 32(%r12)
+ vmovapd %ymm14, 32(%r12, %r13, 1)
+
+ vmovapd 64(%r12), %ymm12
+ vmovapd 64(%r12, %r13, 1), %ymm14
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vbroadcastsd 80(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vbroadcastsd 88(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vmovapd %ymm12, 64(%r12)
+ vmovapd %ymm14, 64(%r12, %r13, 1)
+
+ vmovapd 96(%r12), %ymm12
+ vmovapd 96(%r12, %r13, 1), %ymm14
+ vbroadcastsd 96(%r11), %ymm13
+ addq $128, %r11
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vbroadcastsd -24(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vbroadcastsd -16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vbroadcastsd -8(%r11), %ymm13
+ addq $128, %r12
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vmovapd %ymm12, -32(%r12)
+ vmovapd %ymm14, -32(%r12, %r13, 1)
+
+ cmpl $3, %r10d
+ jg 1b // main loop
+
+ cmpl $0, %r10d
+ jle 0f // return
+
+ // cleanup loop
+2:
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vbroadcastsd 0(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vbroadcastsd 8(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vbroadcastsd 16(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vbroadcastsd 24(%r11), %ymm13
+ vfmadd231pd %ymm3, %ymm13, %ymm12
+ vfmadd231pd %ymm7, %ymm13, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+
+ addq $32, %r11
+ addq $32, %r12
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+ jg 2b // main loop
+
+ // return
+0:
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgebp_add_nn_8x4_lib4, .-inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
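+// Reference only (not assembled): this edge consumes the k-iterations needed to
+// realign B to its next 4-row panel when offB!=0, i.e. (illustrative C, not
+// BLASFEO API):
+//
+//	int kend = 4 - offB;	// rows left in the current panel of B
+//	if(kend > k)
+//		kend = k;	// but never more than k
+//
+// which matches the cmovgl computation of kend below; after kend iterations the
+// main nn kernel can continue from the start of the next B panel.
+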
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_8x4_lib4, @function
+inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r15d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %ebx
+ subl %r15d, %ebx // 4-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,4-offsetB)
+
+ movl %r15d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r13 // B+offsetB*sizeof(double)
+
+ movq %r11, %rax // A1 <- A0
+ addq %r12, %rax // A1 <- A0 + 4*sda*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm12 // A0[0]
+ vmovapd 0(%rax), %ymm14 // A1[0]
+ vbroadcastsd 0(%r13), %ymm13 // B[0]
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vfmadd231pd %ymm14, %ymm13, %ymm4
+ vbroadcastsd 32(%r13), %ymm13 // B[1]
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vfmadd231pd %ymm14, %ymm13, %ymm5
+ vbroadcastsd 64(%r13), %ymm13 // B[2]
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vfmadd231pd %ymm14, %ymm13, %ymm6
+ vbroadcastsd 96(%r13), %ymm13 // B[3]
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+ vfmadd231pd %ymm14, %ymm13, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+	addq	$32, %r11 // A0+1*bs*sizeof(double)
+	addq	$32, %rax // A1+1*bs*sizeof(double)
+	addq	$8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r14, %r13
+ subq $32, %r13 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_8x4_lib4, .-inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x8_lib4, @function
+inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_4x8_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %r15d
+ subl %r14d, %r15d // 4-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,4-offsetB)
+
+ movl %r14d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+ vmovapd 0(%r11), %ymm12
+ vbroadcastsd 0(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm0
+ vbroadcastsd 32(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm1
+ vbroadcastsd 64(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm2
+ vbroadcastsd 96(%r12), %ymm13
+ vfmadd231pd %ymm12, %ymm13, %ymm3
+	vbroadcastsd	128(%r12), %ymm13 // B
+	vfmadd231pd	%ymm12, %ymm13, %ymm4
+	vbroadcastsd	160(%r12), %ymm13 // B
+	vfmadd231pd	%ymm12, %ymm13, %ymm5
+	vbroadcastsd	192(%r12), %ymm13 // B
+	vfmadd231pd	%ymm12, %ymm13, %ymm6
+	vbroadcastsd	224(%r12), %ymm13 // B
+	vfmadd231pd	%ymm12, %ymm13, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+	addq	$32, %r11 // A+1*bs*sizeof(double)
+	addq	$8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x8_lib4, .-inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- 4*sda*sizeof(double)
+// r12 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- 4*sda*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
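+// Reference only (not assembled): over these first 4 k-iterations B is upper
+// triangular, so iteration kk only contributes to columns 0..kk of D; in
+// illustrative C (D0/D1, A0/A1, B as in the nt kernel sketch above):
+//
+//	for(kk=0; kk<4; kk++)
+//		for(jj=0; jj<=kk; jj++)	// skip the zero entries of triangular B
+//			for(ii=0; ii<4; ii++)
+//				{
+//				D0[jj][ii] += A0[4*kk+ii] * B[4*kk+jj];
+//				D1[jj][ii] += A1[4*kk+ii] * B[4*kk+jj];
+//				}
+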
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_8x4_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r15 // A1 <- A0
+ addq %r11, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ vbroadcastsd 0(%r12), %ymm12
+ vmovapd 0(%r10), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 32(%r10), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 32(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 40(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ vbroadcastsd 64(%r12), %ymm12
+ vmovapd 64(%r10), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 64(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 72(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 80(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vbroadcastsd 96(%r12), %ymm12
+ vmovapd 96(%r10), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 96(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 104(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 112(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 120(%r12), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ addq $128, %r10
+ addq $128, %r12
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_8x4_lib4, .-inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*4*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_8x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+#endif
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ addq $32, %r11
+ vmovapd 0(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ addq $32, %r13
+ addq $32, %r15
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ addq $32, %r11
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ addq $32, %r13
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ addq $32, %r15
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ addq $32, %r11
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ addq $32, %r13
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ addq $32, %r15
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vbroadcastsd 0(%r13), %ymm12
+ subl $1, %r10d
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm9
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ addq $32, %r11
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ addq $32, %r13
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+ addq $32, %r15
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_8x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A0
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_8x4_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r15d
+ jg 0f
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+0:
+ cmpl $1, %r15d
+ jg 1f
+
+ // offB==1
+
+ addq $8, %r13 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ subl $3, %r10d // k-3
+ addq $96, %r11 // A0+3*bs*sizeof(double)
+ addq %r14, %r13
+ subq $8, %r13 // B+bs*sdb*sizeof(double)-1
+
+ jmp 3f
+
+1:
+ cmpl $2, %r15d
+ jg 2f
+
+ // offB==2
+
+ addq $16, %r13 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ subl $2, %r10d // k-2
+ addq $64, %r11 // A0+2*bs*sizeof(double)
+ addq %r14, %r13
+ subq $16, %r13 // B+bs*sdb*sizeof(double)-2
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 104(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r13 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-3
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ vmovapd 32(%r11), %ymm8
+ vmovapd 32(%r11, %r12, 1), %ymm9
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 40(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 72(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ vmovapd 64(%r11), %ymm8
+ vmovapd 64(%r11, %r12, 1), %ymm9
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 48(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 80(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 112(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ vmovapd 96(%r11), %ymm8
+ vmovapd 96(%r11, %r12, 1), %ymm9
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 56(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 88(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 120(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A0+4*bs*sizeof(double)
+ addq %r14, %r13 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_8x4_lib4, .-inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A0
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// rax <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_8x4_vs_lib4, @function
+inner_edge_dtrmm_nn_rl_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_8x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ cmpl $0, %r15d
+ jg 0f // offB>0
+
+ // offB==0
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+0:
+ cmpl $1, %r15d
+ jg 1f // offB>1
+
+ // offB==1
+
+ addq $8, %r13 // B+1*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f // end
+
+1:
+ cmpl $2, %r15d
+ jg 2f // offB>2
+
+ // offB==2
+
+ addq $16, %r13 // B+2*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r13 // B+3*sizeof(double)
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq $8, %r13 // B+1*sizeof(double)
+
+ cmpl $0, %r10d
+ jle 3f // end
+
+ vmovapd 0(%r11), %ymm8
+ vmovapd 0(%r11, %r12, 1), %ymm9
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vfmadd231pd %ymm9, %ymm12, %ymm4
+ vbroadcastsd 32(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+ vfmadd231pd %ymm9, %ymm12, %ymm5
+ vbroadcastsd 64(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vfmadd231pd %ymm9, %ymm12, %ymm6
+ vbroadcastsd 96(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+ vfmadd231pd %ymm9, %ymm12, %ymm7
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A0+1*bs*sizeof(double)
+ addq %r14, %r13
+ subq $24, %r13 // B+bs*sdb*sizeof(double)-(bs-1)*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_8x4_vs_lib4, .-inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
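+// No arithmetic happens here: the first vblendpd pass (masks 0xa/0x5)
+// exchanges the odd 64-bit lanes between each pair of accumulators, the
+// second pass (masks 0xc/0x3) exchanges their upper 128-bit halves, undoing
+// the lane rotation left by the gemm loop so that ymm0-3 / ymm4-7 hold plain
+// columns of the 8x4 result (descriptive sketch; see the layouts above).
+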
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_8x4_lib4, @function
+inner_blend_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_8x4_lib4:
+#endif
+#endif
+
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_8x4_lib4, .-inner_blend_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
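+// Reference sketch of the update (alpha==1.0 and beta==1.0 hard-coded; C0 at
+// r10, C1 = C0 + 4*sdc*sizeof(double); "acc" names the ymm0-7 accumulator):
+//
+// for(jj=0; jj<4; jj++)
+//     for(ii=0; ii<8; ii++)
+//         acc[ii][jj] += C[ii][jj];
+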
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x4_lib4, @function
+inner_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_11_8x4_lib4:
+#endif
+#endif
+
+
+ movq %r10, %r12 // C1 <- C0
+ addq %r11, %r12 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r12), %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r12), %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r12), %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r12), %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x4_lib4, .-inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
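+// Reference sketch of the update (C0 at r12, C1 = C0 + 4*sdc*sizeof(double);
+// the beta pass is skipped when beta==0.0, so C is never read in that case):
+//
+// for(jj=0; jj<4; jj++)
+//     for(ii=0; ii<8; ii++)
+//         acc[ii][jj] = alpha*acc[ii][jj] + beta*C[ii][jj];
+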
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_lib4, @function
+inner_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ movq %r12, %r15 // C1 <- C0
+ addq %r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ // alg==1
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_lib4, .-inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta (C at a generic row offset)
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
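+// Same alpha/beta update as inner_scale_ab_8x4_lib4 above, except that C may
+// start at a row offset 0..3 (r12) inside its 4-row panel: for offset>0 each
+// 4-double column of C is reassembled from two consecutive panels with
+// vblendpd and rotated into row order with vperm2f128 (plus vshufpd for
+// offsets 1 and 3) before being fed to the beta FMA.
+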
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_gen_lib4, @function
+inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ movq %r13, %rax // C1 <- C0
+ addq %r14, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ jmp 3f
+
+0:
+
+ movq %rax, %rbx // C2 <- C1
+ addq %r14, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%rax), %ymm13
+ vmovapd 0(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%rax), %ymm13
+ vmovapd 32(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%rax), %ymm13
+ vmovapd 64(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%rax), %ymm13
+ vmovapd 96(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%rax), %ymm13
+ vmovapd 0(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%rax), %ymm13
+ vmovapd 32(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%rax), %ymm13
+ vmovapd 64(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%rax), %ymm13
+ vmovapd 96(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%rax), %ymm13
+ vmovapd 0(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%rax), %ymm13
+ vmovapd 32(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%rax), %ymm13
+ vmovapd 64(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%rax), %ymm13
+ vmovapd 96(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_gen_lib4, .-inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_8x4_lib4, @function
+inner_scale_a0_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_8x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_8x4_lib4, .-inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
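+// This routine fuses the two inner routines documented above: the vblendpd
+// passes of inner_blend_8x4_lib4 put the accumulators into column order,
+// after which the same alpha/beta update as inner_scale_ab_8x4_lib4 is
+// applied (the beta pass is skipped when beta==0.0, so C is not read then).
+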
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_lib4, @function
+inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ movq %r12, %r15 // C1 <- C0
+ addq %r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ // alg==1
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r15), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_lib4, .-inner_blend_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
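+// Reference sketch of the 4x8 update (C is a single 4-row panel at r12 with
+// the 8 columns stored contiguously, 32 bytes apart; beta pass skipped when
+// beta==0.0):
+//
+// for(jj=0; jj<8; jj++)
+//     for(ii=0; ii<4; ii++)
+//         acc[ii][jj] = alpha*acc[ii][jj] + beta*C[ii+jj*4];
+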
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x8_lib4, @function
+inner_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x8_lib4:
+#endif
+#endif
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm14
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+ vmovapd 128(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x8_lib4, .-inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
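+// Descriptive sketch: the vunpcklpd/vunpckhpd pairs interleave the 64-bit
+// lanes of the accumulators and vperm2f128 $0x20/$0x31 recombine their
+// 128-bit halves, the standard AVX idiom for transposing a 4x4 block of
+// doubles; each transposed block is then scaled by alpha and, unless
+// beta==0.0, accumulated with beta times the 4x8 panel of C at r12.
+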
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_lib4, @function
+inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib4:
+#endif
+#endif
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm1, %ymm0, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm3, %ymm2, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm3
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm5, %ymm4, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm7, %ymm6, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm7
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 128(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_lib4, .-inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta (C at a generic row offset)
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_gen_lib4, @function
+inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib4:
+#endif
+#endif
+
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ movq %r13, %rax // C1 <- C0
+ addq %r14, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovapd 0(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r13), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%rax), %ymm14
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ jmp 3f
+
+0:
+
+ movq %rax, %rbx // C2 <- C1
+ addq %r14, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+ cmpl $1, %r12d
+ jg 1f
+
+ // offset==1
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%rax), %ymm13
+ vmovapd 0(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%rax), %ymm13
+ vmovapd 32(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%rax), %ymm13
+ vmovapd 64(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%rax), %ymm13
+ vmovapd 96(%rbx), %ymm14
+ vblendpd $0x1, %ymm13, %ymm12, %ymm12
+ vblendpd $0x1, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm14, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm14, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r12d
+ jg 2f
+
+ // offset==2
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%rax), %ymm13
+ vmovapd 0(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%rax), %ymm13
+ vmovapd 32(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%rax), %ymm13
+ vmovapd 64(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%rax), %ymm13
+ vmovapd 96(%rbx), %ymm14
+ vblendpd $0x3, %ymm13, %ymm12, %ymm12
+ vblendpd $0x3, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd 0(%r13), %ymm12
+ vmovapd 0(%rax), %ymm13
+ vmovapd 0(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm0
+ vfmadd231pd %ymm13, %ymm15, %ymm4
+
+ vmovapd 32(%r13), %ymm12
+ vmovapd 32(%rax), %ymm13
+ vmovapd 32(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm1
+ vfmadd231pd %ymm13, %ymm15, %ymm5
+
+ vmovapd 64(%r13), %ymm12
+ vmovapd 64(%rax), %ymm13
+ vmovapd 64(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm2
+ vfmadd231pd %ymm13, %ymm15, %ymm6
+
+ vmovapd 96(%r13), %ymm12
+ vmovapd 96(%rax), %ymm13
+ vmovapd 96(%rbx), %ymm14
+ vblendpd $0x7, %ymm13, %ymm12, %ymm12
+ vblendpd $0x7, %ymm14, %ymm13, %ymm13
+ vperm2f128 $0x01, %ymm12, %ymm12, %ymm14
+ vshufpd $0x5, %ymm12, %ymm14, %ymm12
+ vperm2f128 $0x01, %ymm13, %ymm13, %ymm14
+ vshufpd $0x5, %ymm13, %ymm14, %ymm13
+ vfmadd231pd %ymm12, %ymm15, %ymm3
+ vfmadd231pd %ymm13, %ymm15, %ymm7
+
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_gen_lib4, .-inner_blend_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_lib4, @function
+inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib4:
+#endif
+#endif
+
+ // tc==n
+ vblendpd $0xa, %ymm1, %ymm0, %ymm8
+ vblendpd $0x5, %ymm1, %ymm0, %ymm9
+ vblendpd $0xa, %ymm3, %ymm2, %ymm10
+ vblendpd $0x5, %ymm3, %ymm2, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm0
+ vblendpd $0x3, %ymm10, %ymm8, %ymm2
+ vblendpd $0xc, %ymm11, %ymm9, %ymm1
+ vblendpd $0x3, %ymm11, %ymm9, %ymm3
+
+ vblendpd $0xa, %ymm5, %ymm4, %ymm8
+ vblendpd $0x5, %ymm5, %ymm4, %ymm9
+ vblendpd $0xa, %ymm7, %ymm6, %ymm10
+ vblendpd $0x5, %ymm7, %ymm6, %ymm11
+
+ vblendpd $0xc, %ymm10, %ymm8, %ymm4
+ vblendpd $0x3, %ymm10, %ymm8, %ymm6
+ vblendpd $0xc, %ymm11, %ymm9, %ymm5
+ vblendpd $0x3, %ymm11, %ymm9, %ymm7
+
+ movq %r10, %r15 // C1 <- C0
+ addq %r11, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ vmovapd 0(%r10), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+
+ vmovapd 0(%r15), %ymm15
+ vaddpd %ymm4, %ymm15, %ymm4
+ vmovapd 32(%r15), %ymm15
+ vaddpd %ymm5, %ymm15, %ymm5
+ vmovapd 64(%r15), %ymm15
+ vaddpd %ymm6, %ymm15, %ymm6
+ vmovapd 96(%r15), %ymm15
+ vaddpd %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_lib4, .-inner_blend_scale_11_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_11_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_11_4x8_lib4, @function
+inner_tran_scale_11_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_11_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_11_4x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_11_4x8_lib4:
+#endif
+#endif
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm1, %ymm0, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm3, %ymm2, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm5, %ymm4, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm7, %ymm6, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm7
+
+#if defined(OS_LINUX) || defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14 // beta=1.0
+#else
+ vmovapd LC04(%rip), %ymm14 // beta=1.0
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 128(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_11_4x8_lib4, .-inner_tran_scale_11_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// Cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
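+// Reference sketch of the factorization (a sketch only: the code scales and
+// downdates whole 4-double registers, the entries above the diagonal are
+// simply never used; "acc" names the ymm0-7 accumulator):
+//
+// for(jj=0; jj<kn && jj<4; jj++) {
+//     djj = acc[jj][jj];
+//     inv = djj>0.0 ? 1.0/sqrt(djj) : 0.0;   // zero the column if not positive
+//     inv_diag_E[jj] = inv;
+//     for(ii=0; ii<8; ii++) acc[ii][jj] *= inv;
+//     for(kk=jj+1; kk<4; kk++)
+//         for(ii=0; ii<8; ii++) acc[ii][kk] -= acc[ii][jj] * acc[kk][jj];
+// }
+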
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_8x4_vs_lib4, @function
+inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) || defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ cmpl $2, %r11d
+ jl 0f // ret
+// vperm2f128 $0x00, %ymm0, %ymm0, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ cmpl $4, %r11d
+ jl 0f // ret
+// vperm2f128 $0x11, %ymm2, %ymm2, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+
+// vextractf128 $0x1, %ymm3, %xmm13
+// vpermilpd $0x3, %xmm13, %xmm13
+ vpermpd $0xff, %ymm3, %ymm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_8x4_vs_lib4, .-inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
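+// note: reference sketch of the forward substitution below; E is a 4x4 lower
+// triangular factor stored column-major (element E[k][j] at byte offset
+// 32*j+8*k from r10) and D is the 8x4 panel held in ymm0-7:
+//   for j = 0..3:
+//     D[:,j] *= inv_diag_E[j]
+//     for k = j+1..3:
+//       D[:,k] -= E[k][j] * D[:,j]
+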
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x4_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
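+// note: reference sketch of the forward substitution below; the 4x8 block of
+// unknowns X sits in ymm0-7 (one register per column), the 8x8 lower
+// triangular factor L sits in two 4-row panels (rows 0-3 at r10, rows 4-7 at
+// r10 + sdd), and r12 points to the inverted diagonal:
+//   for j = 0..7:
+//     X[:,j] *= inv_diag_D[j]
+//     for k = j+1..7:
+//       X[:,k] -= L[k][j] * X[:,j]
+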
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x8_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x8_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x8_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm4
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm7
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm4
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm5
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm7
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm4
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm5
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm6
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm7
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm4
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm5
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm6
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm7
+
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm7, %ymm13, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x8_lib4, .-inner_edge_dtrsm_rlt_inv_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
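+// note: same forward substitution as inner_edge_dtrsm_rlt_inv_8x4_lib4 above,
+// except that kn (r12d) limits the number of columns actually solved: after
+// scaling column j the routine returns early when kn <= j+1, so the trailing
+// updates for the unused columns are skipped.
+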
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x4_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r11), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+
+
+ vbroadcastsd 8(%r11), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+
+ vbroadcastsd 16(%r11), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+
+ vbroadcastsd 24(%r11), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// r13d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- sdd
+// r12 <- inv_diag_D
+// r13d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
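+// note: same substitution as inner_edge_dtrsm_rlt_inv_4x8_lib4 above, with a
+// variable-size cut on the second half only: the first four columns are
+// always solved in full, while after scaling column j (j = 4,5,6) the routine
+// returns early when kn <= j+1.
+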
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X8_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x8_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x8_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x8_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm4
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm7
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm4
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm5
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm7
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm4
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm5
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm6
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm7
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm4
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm5
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm6
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm7
+
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm4, %ymm13, %ymm4
+ cmpl $6, %r13d
+ jl 0f // ret
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm5, %ymm13, %ymm5
+ cmpl $7, %r13d
+ jl 0f // ret
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm6, %ymm13, %ymm6
+ cmpl $8, %r13d
+ jl 0f // ret
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x8_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
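+// note: unit-diagonal variant of the forward substitution: no column scaling
+// is needed, each column k of the 8x4 panel just accumulates
+//   D[:,k] -= E[k][j] * D[:,j]   for j = 0..k-1
+// with E[k][j] read from byte offset 32*j+8*k of the 4x4 panel at r10.
+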
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_8x4_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_8x4_lib4, .-inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = lower
+// tran = transposed
+// unit diagonal
+//
+// input arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
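+// note: same unit-diagonal substitution as inner_edge_dtrsm_rlt_one_8x4_lib4
+// above, returning early before column k (k = 1,2,3) when kn <= k.
+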
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_one_8x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_one_8x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+ jl 0f // ret
+
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+
+ cmpl $3, %r11d
+ jl 0f // ret
+
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+
+ cmpl $4, %r11d
+ jl 0f // ret
+
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_one_8x4_vs_lib4, .-inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
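+// note: reference sketch of the backward substitution below; E at r10 is a
+// 4x4 upper triangular factor stored column-major (E[k][j] at byte offset
+// 32*j+8*k) and the 8x4 panel D sits in ymm0-7:
+//   for j = 3..0:
+//     D[:,j] *= inv_diag_E[j]
+//     for k = 0..j-1:
+//       D[:,k] -= E[k][j] * D[:,j]
+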
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_8x4_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_lib4:
+#endif
+#endif
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm2
+ vfnmadd231pd %ymm7, %ymm12, %ymm6
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm1
+ vfnmadd231pd %ymm7, %ymm12, %ymm5
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm0
+ vfnmadd231pd %ymm7, %ymm12, %ymm4
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm1
+ vfnmadd231pd %ymm6, %ymm12, %ymm5
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm0
+ vfnmadd231pd %ymm6, %ymm12, %ymm4
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm0
+ vfnmadd231pd %ymm5, %ymm12, %ymm4
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_8x4_lib4, .-inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = transposed
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
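+// note: same backward substitution as inner_edge_dtrsm_rut_inv_8x4_lib4
+// above, except that the whole step for column j (scaling plus updates) is
+// skipped when kn <= j, so only the first kn columns are solved.
+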
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rut_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rut_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rut_inv_8x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $3, %r12d
+ jle 0f
+
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm2
+ vfnmadd231pd %ymm7, %ymm12, %ymm6
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm1
+ vfnmadd231pd %ymm7, %ymm12, %ymm5
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm3, %ymm12, %ymm0
+ vfnmadd231pd %ymm7, %ymm12, %ymm4
+
+0:
+ cmpl $2, %r12d
+ jle 1f
+
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm1
+ vfnmadd231pd %ymm6, %ymm12, %ymm5
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm0
+ vfnmadd231pd %ymm6, %ymm12, %ymm4
+
+1:
+ cmpl $1, %r12d
+ jle 2f
+
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm0
+ vfnmadd231pd %ymm5, %ymm12, %ymm4
+
+2:
+
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rut_inv_8x4_vs_lib4, .-inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = right
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
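+// note: reference sketch of the substitution below (right side, upper factor,
+// not transposed); E at r10 is 4x4 upper, column-major (E[j][k] at byte
+// offset 32*k+8*j), and the 8x4 panel D sits in ymm0-7:
+//   for k = 0..3:
+//     for j = 0..k-1:
+//       D[:,k] -= E[j][k] * D[:,j]
+//     D[:,k] *= inv_diag_E[k]
+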
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_run_inv_8x4_lib4, @function
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_run_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_run_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_run_inv_8x4_lib4:
+#endif
+#endif
+
+ // first column
+ vbroadcastsd 0(%r11), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+ // second column
+ vbroadcastsd 32(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm1
+ vfnmadd231pd %ymm4, %ymm12, %ymm5
+ vbroadcastsd 8(%r11), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+
+ // third column
+ vbroadcastsd 64(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm2
+ vfnmadd231pd %ymm4, %ymm12, %ymm6
+ vbroadcastsd 72(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm2
+ vfnmadd231pd %ymm5, %ymm12, %ymm6
+ vbroadcastsd 16(%r11), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+
+ // fourth column
+ vbroadcastsd 96(%r10), %ymm12
+ vfnmadd231pd %ymm0, %ymm12, %ymm3
+ vfnmadd231pd %ymm4, %ymm12, %ymm7
+ vbroadcastsd 104(%r10), %ymm12
+ vfnmadd231pd %ymm1, %ymm12, %ymm3
+ vfnmadd231pd %ymm5, %ymm12, %ymm7
+ vbroadcastsd 112(%r10), %ymm12
+ vfnmadd231pd %ymm2, %ymm12, %ymm3
+ vfnmadd231pd %ymm6, %ymm12, %ymm7
+ vbroadcastsd 24(%r11), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_run_inv_8x4_lib4, .-inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = lower
+// tran = normal
+// unit diagonal
+//
+// input arguments:
+// r10 <- E0
+// r11 <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E0
+// r11 <- 4*sde*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
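+// note: reference sketch of the forward substitution below (unit lower factor
+// E applied from the left); the four right-hand sides are the columns of the
+// 8x4 panel X, rows 0-3 in ymm0-3 and rows 4-7 in ymm4-7:
+//   for i = 0..6:                  // unit diagonal -> no scaling
+//     for each column c:
+//       X[i+1:8][c] -= X[i][c] * E[i+1:8][i]
+// column i of E is read from the two 4-row panels at r10 and r10 + r11, with
+// the entries on and above the diagonal masked to zero via vblendpd before
+// the update.
+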
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lln_one_8x4_lib4, @function
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lln_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lln_one_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lln_one_8x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r12 // E1 <- E0
+ addq %r11, %r12 // E1 <- E0 + 4*sde*sizeof(double)
+
+ // left block-column
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 0(%r10), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vmovapd 0(%r12), %ymm14
+ vpermpd $0x00, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 32(%r10), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vmovapd 32(%r12), %ymm14
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vpermpd $0x55, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+
+ vxorpd %ymm14, %ymm14, %ymm14
+ vmovapd 64(%r10), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vmovapd 64(%r12), %ymm14
+ vpermpd $0xaa, %ymm0, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm0
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vpermpd $0xaa, %ymm1, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm1
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vpermpd $0xaa, %ymm2, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm2
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm3
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+
+ vmovapd 96(%r12), %ymm14
+ vpermpd $0xff, %ymm0, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm4
+ vpermpd $0xff, %ymm1, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm5
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm6
+ vpermpd $0xff, %ymm3, %ymm13
+ vfnmadd231pd %ymm14, %ymm13, %ymm7
+
+ addq $128, %r12
+
+
+ // right block-column
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ vmovapd 0(%r12), %ymm12
+ vblendpd $0x1, %ymm14, %ymm12, %ymm12
+ vpermpd $0x00, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vpermpd $0x00, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vpermpd $0x00, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vpermpd $0x00, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+
+ vmovapd 32(%r12), %ymm12
+ vblendpd $0x3, %ymm14, %ymm12, %ymm12
+ vpermpd $0x55, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vpermpd $0x55, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vpermpd $0x55, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vpermpd $0x55, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+
+ vmovapd 64(%r12), %ymm12
+ vblendpd $0x7, %ymm14, %ymm12, %ymm12
+ vpermpd $0xaa, %ymm4, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm4
+ vpermpd $0xaa, %ymm5, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm5
+ vpermpd $0xaa, %ymm6, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm6
+ vpermpd $0xaa, %ymm7, %ymm13
+ vfnmadd231pd %ymm12, %ymm13, %ymm7
+
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lln_one_8x4_lib4, .-inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
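+// note: reference sketch of the backward substitution below (upper factor E
+// applied from the left, diagonal given in inverted form); rows 0-3 of the
+// four right-hand-side columns sit in ymm0-3, rows 4-7 in ymm4-7, and E is
+// stored in two 4-row panels (rows 0-3 at r10, rows 4-7 at r10 + r11):
+//   for i = 7..0:
+//     for each column c:
+//       t = X[i][c] * inv_diag_E[i]
+//       X[i][c] = t
+//       X[0:i][c] -= t * E[0:i][i]
+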
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_8x4_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r13 // E1 <- E0
+ addq %r11, %r13 // E1 <- E0 + 4*sde*sizeof(double)
+
+ // bottom-right
+
+ vmovapd 224(%r13), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 56(%r12), %ymm12
+ vmovapd 224(%r10), %ymm11
+
+ vpermpd $0xff, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0 (redundant: the VEX vmovapd to %xmm13 below already zeroes the upper 128 bits)
+ vmovapd 192(%r13), %xmm13
+ vbroadcastsd 48(%r12), %ymm12
+ vmovapd 192(%r10), %ymm11
+
+ vpermpd $0xaa, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 160(%r13), %xmm13
+ vbroadcastsd 40(%r12), %ymm12
+ vmovapd 160(%r10), %ymm11
+
+ vpermpd $0x55, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0x55, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0x55, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0x55, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 128(%r10), %ymm11
+
+ vpermpd $0x00, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0x00, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0x00, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0x00, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+
+ // top-left
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r12), %ymm12
+
+ vpermpd $0xff, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r12), %ymm12
+
+ vpermpd $0xaa, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r12), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vbroadcastsd 0(%r12), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_8x4_lib4, .-inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution:
+// side = left
+// uplo = upper
+// tran = normal
+// requires explicit inverse of diagonal
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// r13 <- km
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
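+// note: same backward substitution as inner_edge_dtrsm_lun_inv_8x4_lib4
+// above, except that the step for row i (i = 7,6,5) is skipped when km <= i,
+// i.e. when that row is not actually present in the panel.
+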
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_lun_inv_8x4_vs_lib4, @function
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_lun_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_lun_inv_8x4_vs_lib4:
+#endif
+#endif
+
+ movq %r10, %r14 // E1 <- E0
+ addq %r11, %r14 // E1 <- E0 + 4*sde*sizeof(double)
+
+ // bottom-right
+
+ cmpl $7, %r13d
+ jle 0f
+
+ vmovapd 224(%r14), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 56(%r12), %ymm12
+ vmovapd 224(%r10), %ymm11
+
+ vpermpd $0xff, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+0:
+ cmpl $6, %r13d
+ jle 1f
+
+	vxorpd			%ymm13, %ymm13, %ymm13 // 0.0 (redundant: the VEX vmovapd to %xmm13 below already zeroes the upper 128 bits)
+ vmovapd 192(%r14), %xmm13
+ vbroadcastsd 48(%r12), %ymm12
+ vmovapd 192(%r10), %ymm11
+
+ vpermpd $0xaa, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+1:
+ cmpl $5, %r13d
+ jle 2f
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 160(%r14), %xmm13
+ vbroadcastsd 40(%r12), %ymm12
+ vmovapd 160(%r10), %ymm11
+
+ vpermpd $0x55, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0x55, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0x55, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0x55, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+2:
+
+ vbroadcastsd 32(%r12), %ymm12
+ vmovapd 128(%r10), %ymm11
+
+ vpermpd $0x00, %ymm4, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm4, %ymm4
+ vfnmadd231pd %ymm11, %ymm14, %ymm0
+
+ vpermpd $0x00, %ymm5, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm5, %ymm5
+ vfnmadd231pd %ymm11, %ymm14, %ymm1
+
+ vpermpd $0x00, %ymm6, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm6, %ymm6
+ vfnmadd231pd %ymm11, %ymm14, %ymm2
+
+ vpermpd $0x00, %ymm7, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm7, %ymm7
+ vfnmadd231pd %ymm11, %ymm14, %ymm3
+
+
+ // top-left
+
+ vmovapd 96(%r10), %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14 // 0.0
+ vblendpd $0x7, %ymm13, %ymm14, %ymm13
+ vbroadcastsd 24(%r12), %ymm12
+
+ vpermpd $0xff, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xff, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xff, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xff, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x8, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovapd 64(%r10), %xmm13
+ vbroadcastsd 16(%r12), %ymm12
+
+ vpermpd $0xaa, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermpd $0xaa, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermpd $0xaa, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermpd $0xaa, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x4, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vxorpd %ymm13, %ymm13, %ymm13 // 0.0
+ vmovsd 32(%r10), %xmm13
+ vbroadcastsd 8(%r12), %ymm12
+
+ vpermilpd $0xf, %ymm0, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm0, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm0
+
+ vpermilpd $0xf, %ymm1, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm1, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm1
+
+ vpermilpd $0xf, %ymm2, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm2, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm2
+
+ vpermilpd $0xf, %ymm3, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm14
+ vblendpd $0x2, %ymm14, %ymm3, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm3
+
+
+ vbroadcastsd 0(%r12), %ymm12
+
+ vmulpd %ymm0, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm0, %ymm0
+
+ vmulpd %ymm1, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+
+ vmulpd %ymm2, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm2, %ymm2
+
+ vmulpd %ymm3, %ymm12, %ymm14
+ vblendpd $0x1, %ymm14, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_lun_inv_8x4_vs_lib4, .-inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// LU factorization without pivoting
+// left kernel
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
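+// note: reference sketch of the unblocked LU step below (no pivoting); in
+// effect, with the 8x4 panel A held column-wise in ymm0-3 (rows 0-3) and
+// ymm4-7 (rows 4-7):
+//   for j = 0..3:
+//     for i = 0..j-1:
+//       A[i+1:8][j] -= A[i][j] * A[i+1:8][i]   // apply previous L columns
+//     inv_diag_E[j] = 1.0 / A[j][j]
+//     A[j+1:8][j] *= inv_diag_E[j]             // rows 0..j keep the U entries
+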
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgetrf_l_8x4_lib4, @function
+inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgetrf_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgetrf_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgetrf_l_8x4_lib4:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+// vmovddup %xmm14, %xmm14
+
+ // first column
+// vblendpd $0x1, %ymm0, %ymm12, %ymm12
+ vmovapd %ymm0, %ymm12
+ vdivsd %xmm0, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 0(%r10)
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vblendpd $0x1, %ymm12, %ymm0, %ymm0
+
+ // second column
+ vpermpd $0x00, %ymm1, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vblendpd $0x2, %ymm1, %ymm13, %ymm12
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 8(%r10)
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vblendpd $0x3, %ymm12, %ymm1, %ymm1
+
+ // third column
+ vpermpd $0x00, %ymm2, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vblendpd $0x2, %ymm2, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm2, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vblendpd $0x4, %ymm2, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm2, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 16(%r10)
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vblendpd $0x7, %ymm12, %ymm2, %ymm2
+
+ // fourth column
+ vpermpd $0x00, %ymm3, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vblendpd $0x2, %ymm3, %ymm13, %ymm12
+
+ vpermpd $0x55, %ymm3, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vblendpd $0x4, %ymm3, %ymm12, %ymm12
+
+ vpermpd $0xaa, %ymm3, %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vblendpd $0x8, %ymm3, %ymm12, %ymm12
+
+ vpermpd $0xff, %ymm3, %ymm13
+ vdivsd %xmm13, %xmm14, %xmm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmovsd %xmm13, 24(%r10)
+// vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vblendpd $0x7, %ymm12, %ymm3, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgetrf_l_8x4_lib4, .-inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_lib4, @function
+inner_store_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_lib4:
+#endif
+#endif
+
+ movq %r10, %r15 // D1 <- D0
+ addq %r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r15)
+ vmovapd %ymm5, 32(%r15)
+ vmovapd %ymm6, 64(%r15)
+ vmovapd %ymm7, 96(%r15)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_lib4, .-inner_store_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_lib4, @function
+inner_store_4x8_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 128(%r10)
+ vmovapd %ymm5, 160(%r10)
+ vmovapd %ymm6, 192(%r10)
+ vmovapd %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_lib4, .-inner_store_4x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
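+// note: the masked store below builds its lane mask arithmetically: km (r12d)
+// is converted to double, broadcast, and subtracted from the per-lane
+// constants LC03 (defined elsewhere in this file), so that, in effect, a lane
+// of the second 4-row panel gets a negative (sign-bit set) mask entry, and is
+// thus written by vmaskmovpd, only when its row falls within the first km
+// rows; kn (r13d) limits how many of the four columns are written.
+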
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_vs_lib4, @function
+inner_store_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib4:
+#endif
+#endif
+
+ movq %r10, %r15 // D1 <- D0
+ addq %r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ cmpl $2, %r13d
+ vmovapd %ymm0, 0(%r10)
+ vmaskmovpd %ymm4, %ymm15, 0(%r15)
+ jl 0f // end
+ cmpl $3, %r13d
+ vmovapd %ymm1, 32(%r10)
+ vmaskmovpd %ymm5, %ymm15, 32(%r15)
+ jl 0f // end
+ vmovapd %ymm2, 64(%r10)
+ vmaskmovpd %ymm6, %ymm15, 64(%r15)
+ je 0f // end
+ vmovapd %ymm3, 96(%r10)
+ vmaskmovpd %ymm7, %ymm15, 96(%r15)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_vs_lib4, .-inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
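+// note: same masking scheme as inner_store_8x4_vs_lib4 above, but applied to
+// a single 4-row panel holding all eight columns: the row mask is built from
+// km (r11d) and the per-lane constants LC02, every column is stored through
+// vmaskmovpd, and kn (r12d) cuts the store of columns 5, 6 and 7 (0-based).
+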
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_vs_lib4, @function
+inner_store_4x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmaskmovpd %ymm0, %ymm15, 0(%r10)
+ vmaskmovpd %ymm1, %ymm15, 32(%r10)
+ vmaskmovpd %ymm2, %ymm15, 64(%r10)
+ vmaskmovpd %ymm3, %ymm15, 96(%r10)
+
+ vmaskmovpd %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovpd %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmaskmovpd %ymm7, %ymm15, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_vs_lib4, .-inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_lib4, @function
+inner_store_l_8x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib4:
+#endif
+#endif
+
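+	// lower-triangular store: for columns 1..3 of the upper 4x4 block the strictly-upper
+	// entries are reloaded from D and blended back in, so only the diagonal and
+	// sub-diagonal part is overwritten; the lower 4x4 block is stored in full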
+ movq %r10, %r15 // D1 <- D0
+ addq %r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+	vmovapd %ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r15)
+ vmovapd %ymm5, 32(%r15)
+ vmovapd %ymm6, 64(%r15)
+ vmovapd %ymm7, 96(%r15)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_lib4, .-inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_vs_lib4, @function
+inner_store_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib4:
+#endif
+#endif
+
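+	// same lower-triangular blend as in the non-vs store, with a km-based row mask
+	// (built from LC03 as above) applied to the lower 4x4 block and kn (r13d)
+	// limiting how many columns are written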
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmovapd %ymm0, 0(%r10)
+ vmaskmovpd %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmaskmovpd %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmaskmovpd %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+ vmaskmovpd %ymm7, %ymm15, 96(%r10, %r11, 1)
+
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_vs_lib4, .-inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_gen_lib4, @function
+inner_store_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+ vmovupd .LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+ vmovupd LC03(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
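+	// ymm14 masks the upper 4-row panel (only rows with index >= m0 are written),
+	// ymm15 masks the lower panel (only rows with index < m1 are written)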
+
+	// shift D and the accumulation registers to skip the first n0 columns
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm3, %ymm2
+ vmovapd %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmaskmovpd %ymm0, %ymm14, 0(%r11)
+ vmaskmovpd %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm1, %ymm14, 32(%r11)
+ vmaskmovpd %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm2, %ymm14, 64(%r11)
+ vmaskmovpd %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 4f // end
+ vmaskmovpd %ymm3, %ymm14, 96(%r11)
+ vmaskmovpd %ymm7, %ymm15, 96(%r11, %r12, 1)
+
+ jmp 4f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
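+	// rotate the 8-row block by one row to match the store offset: each upper/lower
+	// register pair is shifted by one lane with vperm2f128 + vshufpd, and the row masks
+	// are rotated the same way so they keep tracking the original rows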
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm12
+ vshufpd $0x5, %ymm4, %ymm12, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm12
+ vshufpd $0x5, %ymm5, %ymm12, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm12
+ vshufpd $0x5, %ymm6, %ymm12, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm12
+ vshufpd $0x5, %ymm7, %ymm12, %ymm7
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x1, %ymm14, %ymm15, %ymm14
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
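+	// rotate by two rows: a single vperm2f128 swaps the 128-bit halves of each
+	// upper/lower register pair, and the row masks are rotated accordingly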
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x3, %ymm14, %ymm15, %ymm14
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
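+	// rotate by three rows: the mirror of the offset==1 case, again built from
+	// vperm2f128 and vshufpd on each register pair and on the row masks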
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x21, %ymm0, %ymm4, %ymm12
+ vshufpd $0x5, %ymm12, %ymm4, %ymm0
+ vperm2f128 $0x21, %ymm4, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x21, %ymm1, %ymm5, %ymm12
+ vshufpd $0x5, %ymm12, %ymm5, %ymm1
+ vperm2f128 $0x21, %ymm5, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x21, %ymm2, %ymm6, %ymm12
+ vshufpd $0x5, %ymm12, %ymm6, %ymm2
+ vperm2f128 $0x21, %ymm6, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x21, %ymm3, %ymm7, %ymm12
+ vshufpd $0x5, %ymm12, %ymm7, %ymm3
+ vperm2f128 $0x21, %ymm7, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm12, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x7, %ymm14, %ymm15, %ymm14
+
+3:
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 4f // end
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+4:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_gen_lib4, .-inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store l generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_gen_lib4, @function
+inner_store_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+ vmovupd .LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+ vmovupd LC03(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
+
+	// shift D and the accumulation registers to skip the first n0 columns
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm3, %ymm2
+ vmovapd %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm13
+#endif
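+	// LC04 is assumed to hold positive values (sign bit clear); blending it into the row
+	// mask column by column disables the masked store for the strictly-upper entries of
+	// the diagonal block, so those entries of D are left untouched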
+
+ vmaskmovpd %ymm0, %ymm14, 0(%r11)
+ vmaskmovpd %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm1, %ymm14, 32(%r11)
+ vmaskmovpd %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm2, %ymm14, 64(%r11)
+ vmaskmovpd %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 3f // end
+ vblendpd $0x4, %ymm13, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm14, 96(%r11)
+ vmaskmovpd %ymm7, %ymm15, 96(%r11, %r12, 1)
+
+ jmp 3f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm12
+ vshufpd $0x5, %ymm4, %ymm12, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm12
+ vshufpd $0x5, %ymm5, %ymm12, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm12
+ vshufpd $0x5, %ymm6, %ymm12, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm12
+ vshufpd $0x5, %ymm7, %ymm12, %ymm7
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC08(%rip), %ymm12
+ vmovupd .LC05(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC08(%rip), %ymm12
+ vmovupd LC05(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x1, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x2, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC09(%rip), %ymm12
+ vmovupd .LC06(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC09(%rip), %ymm12
+ vmovupd LC06(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x3, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x4, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x21, %ymm0, %ymm4, %ymm12
+ vshufpd $0x5, %ymm12, %ymm4, %ymm0
+ vperm2f128 $0x21, %ymm4, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x21, %ymm1, %ymm5, %ymm12
+ vshufpd $0x5, %ymm12, %ymm5, %ymm1
+ vperm2f128 $0x21, %ymm5, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x21, %ymm2, %ymm6, %ymm12
+ vshufpd $0x5, %ymm12, %ymm6, %ymm2
+ vperm2f128 $0x21, %ymm6, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x21, %ymm3, %ymm7, %ymm12
+ vshufpd $0x5, %ymm12, %ymm7, %ymm3
+ vperm2f128 $0x21, %ymm7, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm7
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm12, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC10(%rip), %ymm12
+ vmovupd .LC07(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC10(%rip), %ymm12
+ vmovupd LC07(%rip), %ymm13
+#endif
+ vandpd %ymm12, %ymm14, %ymm12
+ vandpd %ymm13, %ymm15, %ymm13
+
+ vblendpd $0x7, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm15
+#elif defined(OS_MAC)
+ vmovapd LC04(%rip), %ymm15
+#endif
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 3f // end
+ vblendpd $0x8, %ymm15, %ymm12, %ymm12
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 3f // end
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ je 3f // end
+ vblendpd $0x2, %ymm15, %ymm14, %ymm14
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_gen_lib4, .-inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgemm_nt_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_lib4
+ .type kernel_dgemm_nt_8x4_lib4, @function
+kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_lib4
+_kernel_dgemm_nt_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_lib4
+ .def kernel_dgemm_nt_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_lib4, .-kernel_dgemm_nt_8x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8
+// void kernel_dgemm_nt_4x8_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x8_lib4
+ .type kernel_dgemm_nt_4x8_lib4, @function
+kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x8_lib4
+_kernel_dgemm_nt_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x8_lib4
+ .def kernel_dgemm_nt_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
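+	// A and B are swapped here: the shared 8x4 nt kernel accumulates B*A^T, i.e. the
+	// transpose of the 4x8 result, which the tran_scale routine below transposes back
+	// before the 4x8 store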
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG5, %r12 // sdb
+ sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x8_lib4, .-kernel_dgemm_nt_4x8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_nt_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_vs_lib4
+ .type kernel_dgemm_nt_8x4_vs_lib4, @function
+kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_vs_lib4
+_kernel_dgemm_nt_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_vs_lib4
+ .def kernel_dgemm_nt_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_vs_lib4, .-kernel_dgemm_nt_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dgemm_nt_4x8_vs_lib4(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x8_vs_lib4
+ .type kernel_dgemm_nt_4x8_vs_lib4, @function
+kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x8_vs_lib4
+_kernel_dgemm_nt_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x8_vs_lib4
+ .def kernel_dgemm_nt_4x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
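+	// as in kernel_dgemm_nt_4x8_lib4 above: A and B are swapped and the transposed 8x4
+	// accumulator is handled by the tran_scale routine below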
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG5, %r12 // sdb
+ sall $5, %r12d // 4*sdb*sizeof(double)
+ movq ARG3, %r13 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // km
+ movq ARG10, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x8_vs_lib4, .-kernel_dgemm_nt_4x8_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dgemm_nt_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x4_gen_lib4
+ .type kernel_dgemm_nt_8x4_gen_lib4, @function
+kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x4_gen_lib4
+_kernel_dgemm_nt_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x4_gen_lib4
+ .def kernel_dgemm_nt_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+	sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x4_gen_lib4, .-kernel_dgemm_nt_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_nn_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_8x4_lib4
+ .type kernel_dgemm_nn_8x4_lib4, @function
+kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_8x4_lib4
+_kernel_dgemm_nn_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_8x4_lib4
+ .def kernel_dgemm_nn_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_8x4_lib4, .-kernel_dgemm_nn_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nn_4x8_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x8_lib4
+ .type kernel_dgemm_nn_4x8_lib4, @function
+kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x8_lib4
+_kernel_dgemm_nn_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x8_lib4
+ .def kernel_dgemm_nn_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x8_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x8_lib4, .-kernel_dgemm_nn_4x8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88 rsp+96
+// void kernel_dgemm_nn_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_8x4_gen_lib4
+ .type kernel_dgemm_nn_8x4_gen_lib4, @function
+kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_8x4_gen_lib4
+_kernel_dgemm_nn_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_8x4_gen_lib4
+ .def kernel_dgemm_nn_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // offsetC
+ movq ARG10, %r13 // C
+ movq ARG11, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG12, %r10 // offsetD
+ movq ARG13, %r11 // D
+ movq ARG14, %r12 // sdd
+	sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG15, %r13 // m0
+ movq ARG16, %r14 // m1
+ movq ARG17, %r15 // n0
+ movq ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_8x4_gen_lib4, .-kernel_dgemm_nn_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dsyrk_nt_l_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_lib4
+ .type kernel_dsyrk_nt_l_8x4_lib4, @function
+kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_lib4
+_kernel_dsyrk_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_lib4
+ .def kernel_dsyrk_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_lib4, .-kernel_dsyrk_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dsyrk_nt_l_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_vs_lib4
+ .type kernel_dsyrk_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_vs_lib4
+_kernel_dsyrk_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_vs_lib4
+ .def kernel_dsyrk_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+	movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_vs_lib4, .-kernel_dsyrk_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dsyrk_nt_l_8x4_gen_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x4_gen_lib4
+ .type kernel_dsyrk_nt_l_8x4_gen_lib4, @function
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x4_gen_lib4
+_kernel_dsyrk_nt_l_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x4_gen_lib4
+ .def kernel_dsyrk_nt_l_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+	sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x4_gen_lib4, .-kernel_dsyrk_nt_l_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nn_rl_8x4_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_8x4_lib4
+ .type kernel_dtrmm_nn_rl_8x4_lib4, @function
+kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_8x4_lib4
+_kernel_dtrmm_nn_rl_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_8x4_lib4
+ .def kernel_dtrmm_nn_rl_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq ARG4, %r12 // sda
+	sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_8x4_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_8x4_lib4, .-kernel_dtrmm_nn_rl_8x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dtrmm_nn_rl_8x4_vs_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_8x4_vs_lib4
+ .type kernel_dtrmm_nn_rl_8x4_vs_lib4, @function
+kernel_dtrmm_nn_rl_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_8x4_vs_lib4
+_kernel_dtrmm_nn_rl_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_8x4_vs_lib4
+ .def kernel_dtrmm_nn_rl_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq ARG4, %r12 // sda
+	sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_8x4_vs_lib4, .-kernel_dtrmm_nn_rl_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_dtrmm_nn_rl_8x4_gen_lib4(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+ .type kernel_dtrmm_nn_rl_8x4_gen_lib4, @function
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_8x4_gen_lib4
+_kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_8x4_gen_lib4
+ .def kernel_dtrmm_nn_rl_8x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_8x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq ARG4, %r12 // sda
+	sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_8x4_vs_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // offsetD
+ movq ARG9, %r11 // D
+ movq ARG10, %r12 // sdd
+	sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG11, %r13 // m0
+ movq ARG12, %r14 // m1
+ movq ARG13, %r15 // n0
+ movq ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_8x4_gen_lib4, .-kernel_dtrmm_nn_rl_8x4_gen_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrmm_nt_ru_8x4_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_8x4_lib4
+ .type kernel_dtrmm_nt_ru_8x4_lib4, @function
+kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_8x4_lib4
+_kernel_dtrmm_nt_ru_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_8x4_lib4
+ .def kernel_dtrmm_nt_ru_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+	subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ addq $128, %r13 // B+4*bs
+
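+	// the first 4 columns of B form the upper-triangular edge and are handled by the
+	// dedicated edge routine further down; the plain nt kernel therefore starts 4
+	// columns in (hence k-4, A+4*bs and B+4*bs above)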
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10 // A
+ movq ARG4, %r11 // sda
+ sall $5, %r11d // 4*sda*sizeof(double)
+ movq ARG5, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_8x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_8x4_lib4, .-kernel_dtrmm_nt_ru_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrmm_nt_ru_8x4_vs_lib4(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_8x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_8x4_vs_lib4
+_kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_8x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+	subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ addq $128, %r13 // B+4*bs
+
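+	// same split as in the non-vs kernel: the 4 triangular columns are skipped here and
+	// processed by the vs edge routine below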
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+// INNER_BLEND_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+// call inner_blend_8x4_lib4
+#elif defined(OS_MAC)
+// callq _inner_blend_8x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+	movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movq ARG8, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib4
+#endif
+#endif
+
+
+	// store n
+
+ movq ARG9, %r10 // store address D
+ movq ARG10, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_8x4_vs_lib4, .-kernel_dtrmm_nt_ru_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dpotrf_nt_l_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_8x4_lib4
+ .type kernel_dpotrf_nt_l_8x4_lib4, @function
+kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_8x4_lib4
+_kernel_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_8x4_lib4
+ .def kernel_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_8x4_lib4, .-kernel_dpotrf_nt_l_8x4_lib4
+#endif
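+// Usage note (illustrative sketch, not part of the upstream kernel): assuming
+// the lib4 panel-major layout implied by the shifts above (4-row panels, with
+// sda/sdc/sdd given in doubles so consecutive panels sit 4*sd doubles apart),
+// a caller could drive this kernel as follows. It subtracts A*B^T from C,
+// factorizes the resulting 8x4 lower block, and stores it in D, presumably
+// writing the reciprocal diagonal entries to inv_diag_D.
+//
+//     void kernel_dpotrf_nt_l_8x4_lib4(int k, double *A, int sda, double *B,
+//             double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+//
+//     kernel_dpotrf_nt_l_8x4_lib4(k, A, sda, B, C, sdc, D, sdd, inv_diag_D);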
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dpotrf_nt_l_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_8x4_vs_lib4
+ .type kernel_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_8x4_vs_lib4
+ .def kernel_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dsyrk_dpotrf_nt_l_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_8x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_8x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x8_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x8_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x8_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x8_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG3, %r11
+ movq ARG4, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG2, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG9, %r12 // inv_diag_E
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG6, %r10 // store address D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x8_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x8_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movq ARG16, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG15, %r12 // km
+ movq ARG16, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4(int kp, double *Ap, double *Bp, int sdbp, int km, double *Am, double *Bm, int sdbm, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG3, %r11 // Bp
+ movq ARG4, %r12 // sdbp
+ sall $5, %r12d // 32*sdbp
+ movq ARG2, %r13 // Ap
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG7, %r11 // Bm
+ movq ARG8, %r12 // sdbm
+ sall $5, %r12d // 32*sdbm
+ movq ARG6, %r13 // Am
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG11, %r10 // E
+ movq ARG12, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG13, %r12 // inv_diag_E
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x8_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // store address D
+ movq ARG14, %r11 // km
+ movq ARG15, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nt_rl_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dtrsm_nt_rl_inv_8x4_lib4
+#endif
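+// Usage note (illustrative sketch): under the same lib4 layout assumptions as
+// above, this kernel subtracts A*B^T from C and then applies a right-side
+// lower-transposed solve against E (a 4x4 lower-triangular factor, with
+// inv_diag_E presumably holding its reciprocal diagonal), roughly
+// D = (C - A*B^T) * E^-T for the 8x4 block:
+//
+//     void kernel_dtrsm_nt_rl_inv_8x4_lib4(int k, double *A, int sda,
+//             double *B, double *C, int sdc, double *D, int sdd,
+//             double *E, double *inv_diag_E);
+//
+//     kernel_dtrsm_nt_rl_inv_8x4_lib4(k, A, sda, B, C, sdc, D, sdd,
+//             E, inv_diag_E);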
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_dtrsm_nt_rl_inv_4x8_lib4(int k, double *A, double *B, int sdb, double *C, double *D, double *E, int sde, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x8_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x8_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x8_lib4
+_kernel_dtrsm_nt_rl_inv_4x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x8_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x8_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG3, %r11
+ movq ARG4, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG2, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG7, %r10 // E
+ movq ARG8, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG9, %r12 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG6, %r10 // store address D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x8_lib4, .-kernel_dtrsm_nt_rl_inv_4x8_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4
+#endif
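+// Usage note (illustrative sketch): this fused kernel chains an add pass over
+// kp columns of Ap/Bp, a subtract pass over km columns of Am/Bm, the load of
+// C, the right-lower-transposed solve against E, and the store into D, so a
+// factorization sweep can fold the trailing update and the solve into a
+// single call, e.g. (reusing the prototype documented above):
+//
+//     kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(kp, Ap, sdap, Bp,
+//             km, Am, sdam, Bm, C, sdc, D, sdd, E, inv_diag_E);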
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_one_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_8x4_lib4
+ .type kernel_dtrsm_nt_rl_one_8x4_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_8x4_lib4
+_kernel_dtrsm_nt_rl_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_8x4_lib4
+ .def kernel_dtrsm_nt_rl_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_8x4_lib4, .-kernel_dtrsm_nt_rl_one_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nt_rl_one_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_one_8x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+_kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_one_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_ONE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_one_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_one_8x4_vs_lib4, .-kernel_dtrsm_nt_rl_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dtrsm_nt_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+ .type kernel_dtrsm_nt_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_8x4_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_lib4
+ .def kernel_dtrsm_nt_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_8x4_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nt_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_ru_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUT_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rut_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // store address D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nt_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nn_ru_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+ .type kernel_dtrsm_nn_ru_inv_8x4_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_8x4_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_lib4
+ .def kernel_dtrsm_nn_ru_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_8x4_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrsm_nn_ru_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nn_ru_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ru_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_run_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_run_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ru_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_ru_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dtrsm_nn_ll_one_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_8x4_lib4
+ .type kernel_dtrsm_nn_ll_one_8x4_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_8x4_lib4
+_kernel_dtrsm_nn_ll_one_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_8x4_lib4
+ .def kernel_dtrsm_nn_ll_one_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_8x4_lib4, .-kernel_dtrsm_nn_ll_one_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56
+// void kernel_dtrsm_nn_ll_one_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+ .type kernel_dtrsm_nn_ll_one_8x4_vs_lib4, @function
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+_kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+ .def kernel_dtrsm_nn_ll_one_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_ll_one_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LLN_ONE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lln_one_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lln_one_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_ll_one_8x4_vs_lib4, .-kernel_dtrsm_nn_ll_one_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dtrsm_nn_lu_inv_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+ .type kernel_dtrsm_nn_lu_inv_8x4_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_8x4_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_lib4
+ .def kernel_dtrsm_nn_lu_inv_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_8x4_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64
+// void kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+ .type kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, @function
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+_kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+ .def kernel_dtrsm_nn_lu_inv_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nn_lu_inv_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+ movq ARG13, %r13 // km
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_LUN_INV_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_lun_inv_8x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG13, %r12 // km
+	movq	ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nn_lu_inv_8x4_vs_lib4, .-kernel_dtrsm_nn_lu_inv_8x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgetrf_nn_l_8x4_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_l_8x4_lib4
+ .type kernel_dgetrf_nn_l_8x4_lib4, @function
+kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_l_8x4_lib4
+_kernel_dgetrf_nn_l_8x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_l_8x4_lib4
+ .def kernel_dgetrf_nn_l_8x4_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib4
+#endif
+#endif
+
+
+ // epilogue
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_l_8x4_lib4, .-kernel_dgetrf_nn_l_8x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgetrf_nn_l_8x4_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgetrf_nn_l_8x4_vs_lib4
+ .type kernel_dgetrf_nn_l_8x4_vs_lib4, @function
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgetrf_nn_l_8x4_vs_lib4
+_kernel_dgetrf_nn_l_8x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgetrf_nn_l_8x4_vs_lib4
+ .def kernel_dgetrf_nn_l_8x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgetrf_nn_l_8x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+	movq	ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nn_8x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGETRF_L_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgetrf_l_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgetrf_l_8x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // km
+	movq	ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgetrf_nn_l_8x4_vs_lib4, .-kernel_dgetrf_nn_l_8x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5
+// void kernel_dlarfb4_r_8_lib4(int kmax, double *pV, double *pT, double *pD, int sdd);
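+//
+// applies a block of 4 Householder reflectors to 8 rows of D from the right;
+// as a sketch (V = 4 x kmax block of reflector rows with unit diagonal in its
+// leading 4x4, T = 4x4 upper triangular factor as produced by the matching
+// dlarft-type kernel -- the reflector sign is assumed to be folded into T):
+//   W = D * V^T
+//   W = W * T
+//   D = D + W * V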
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dlarfb4_r_8_lib4
+ .type kernel_dlarfb4_r_8_lib4, @function
+kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dlarfb4_r_8_lib4
+_kernel_dlarfb4_r_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dlarfb4_r_8_lib4
+ .def kernel_dlarfb4_r_8_lib4; .scl 2; .type 32; .endef
+kernel_dlarfb4_r_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+// vxorpd %ymm0, %ymm0, %ymm0
+// vmovapd %ymm0, %ymm1
+// vmovapd %ymm0, %ymm2
+// vmovapd %ymm0, %ymm3
+// vmovapd %ymm0, %ymm4
+// vmovapd %ymm0, %ymm5
+// vmovapd %ymm0, %ymm6
+// vmovapd %ymm0, %ymm7
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // D
+ movq ARG5, %r12 // sdd
+ sall $5, %r12d
+ movq ARG2, %r13 // V
+
+ //
+ vmovapd 0(%r11), %ymm0
+ vmovapd 0(%r11, %r12, 1), %ymm4
+ //
+ vmovapd 32(%r11), %ymm1
+ vmovapd 32(%r11, %r12, 1), %ymm5
+ vbroadcastsd 32(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm1, %ymm0
+ vfmadd231pd %ymm13, %ymm5, %ymm4
+ //
+ vmovapd 64(%r11), %ymm2
+ vmovapd 64(%r11, %r12, 1), %ymm6
+ vbroadcastsd 64(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm2, %ymm0
+ vfmadd231pd %ymm13, %ymm6, %ymm4
+ vbroadcastsd 72(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm2, %ymm1
+ vfmadd231pd %ymm13, %ymm6, %ymm5
+ //
+ vmovapd 96(%r11), %ymm3
+ vmovapd 96(%r11, %r12, 1), %ymm7
+ vbroadcastsd 96(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm0
+ vfmadd231pd %ymm13, %ymm7, %ymm4
+ vbroadcastsd 104(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm1
+ vfmadd231pd %ymm13, %ymm7, %ymm5
+ vbroadcastsd 112(%r13), %ymm13
+ vfmadd231pd %ymm13, %ymm3, %ymm2
+ vfmadd231pd %ymm13, %ymm7, %ymm6
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r13
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x4_lib4
+#endif
+#endif
+
+ movq ARG3, %r10 // T
+
+ //
+ vbroadcastsd 120(%r10), %ymm12
+ vmulpd %ymm3, %ymm12, %ymm3
+ vmulpd %ymm7, %ymm12, %ymm7
+ //
+ vbroadcastsd 112(%r10), %ymm12
+ vfmadd231pd %ymm2, %ymm12, %ymm3
+ vfmadd231pd %ymm6, %ymm12, %ymm7
+ vbroadcastsd 80(%r10), %ymm12
+ vmulpd %ymm2, %ymm12, %ymm2
+ vmulpd %ymm6, %ymm12, %ymm6
+ //
+ vbroadcastsd 104(%r10), %ymm12
+ vfmadd231pd %ymm1, %ymm12, %ymm3
+ vfmadd231pd %ymm5, %ymm12, %ymm7
+ vbroadcastsd 72(%r10), %ymm12
+ vfmadd231pd %ymm1, %ymm12, %ymm2
+ vfmadd231pd %ymm5, %ymm12, %ymm6
+ vbroadcastsd 40(%r10), %ymm12
+ vmulpd %ymm1, %ymm12, %ymm1
+ vmulpd %ymm5, %ymm12, %ymm5
+ //
+ vbroadcastsd 96(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm3
+ vfmadd231pd %ymm4, %ymm12, %ymm7
+ vbroadcastsd 64(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm2
+ vfmadd231pd %ymm4, %ymm12, %ymm6
+ vbroadcastsd 32(%r10), %ymm12
+ vfmadd231pd %ymm0, %ymm12, %ymm1
+ vfmadd231pd %ymm4, %ymm12, %ymm5
+ vbroadcastsd 0(%r10), %ymm12
+ vmulpd %ymm0, %ymm12, %ymm0
+ vmulpd %ymm4, %ymm12, %ymm4
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // V
+ movq ARG4, %r12 // D
+ movq ARG5, %r13 // sdd
+ sall $5, %r13d
+
+ //
+ vmovapd 0(%r12), %ymm12
+ vmovapd 0(%r12, %r13, 1), %ymm14
+ vaddpd %ymm12, %ymm0, %ymm12
+ vaddpd %ymm14, %ymm4, %ymm14
+ vmovapd %ymm12, 0(%r12)
+ vmovapd %ymm14, 0(%r12, %r13, 1)
+ //
+ vmovapd 32(%r12), %ymm12
+ vmovapd 32(%r12, %r13, 1), %ymm14
+ vbroadcastsd 32(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vaddpd %ymm12, %ymm1, %ymm12
+ vaddpd %ymm14, %ymm5, %ymm14
+ vmovapd %ymm12, 32(%r12)
+ vmovapd %ymm14, 32(%r12, %r13, 1)
+ //
+ vmovapd 64(%r12), %ymm12
+ vmovapd 64(%r12, %r13, 1), %ymm14
+ vbroadcastsd 64(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vbroadcastsd 72(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vaddpd %ymm12, %ymm2, %ymm12
+ vaddpd %ymm14, %ymm6, %ymm14
+ vmovapd %ymm12, 64(%r12)
+ vmovapd %ymm14, 64(%r12, %r13, 1)
+ //
+ vmovapd 96(%r12), %ymm12
+ vmovapd 96(%r12, %r13, 1), %ymm14
+ vbroadcastsd 96(%r11), %ymm13
+ vfmadd231pd %ymm0, %ymm13, %ymm12
+ vfmadd231pd %ymm4, %ymm13, %ymm14
+ vbroadcastsd 104(%r11), %ymm13
+ vfmadd231pd %ymm1, %ymm13, %ymm12
+ vfmadd231pd %ymm5, %ymm13, %ymm14
+ vbroadcastsd 112(%r11), %ymm13
+ vfmadd231pd %ymm2, %ymm13, %ymm12
+ vfmadd231pd %ymm6, %ymm13, %ymm14
+ vaddpd %ymm12, %ymm3, %ymm12
+ vaddpd %ymm14, %ymm7, %ymm14
+ vmovapd %ymm12, 96(%r12)
+ vmovapd %ymm14, 96(%r12, %r13, 1)
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEBP_ADD_NN_8X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgebp_add_nn_8x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgebp_add_nn_8x4_lib4
+#endif
+#endif
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dlarfb4_r_8_lib4, .-kernel_dlarfb4_r_8_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
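+// (each constant is a double given as two .long words, low word first:
+//  e.g. ".long 0, 1071644672" is 0x3FE0000000000000 = 0.5 and
+//  ".long 0, 1072693248" is 0x3FF0000000000000 = 1.0)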
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgemm_8x8_lib4.S b/kernel/avx2/kernel_dgemm_8x8_lib4.S
new file mode 100644
index 0000000..954c96d
--- /dev/null
+++ b/kernel/avx2/kernel_dgemm_8x8_lib4.S
@@ -0,0 +1,5625 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define ARG19 STACKSIZE + 104(%rsp)
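+// note: at entry the return address is at 0(%rsp) and the 7th, 8th, ...
+// integer arguments at 8(%rsp), 16(%rsp), ...; after the PROLOGUE below
+// subtracts STACKSIZE, argument n (n >= 7) is therefore found at
+// STACKSIZE + 8*(n-6)(%rsp), which is what ARG7..ARG19 encode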
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define ARG19 STACKSIZE + 152(%rsp)
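+// note: the Microsoft x64 convention passes 4 arguments in registers and the
+// caller reserves a 32-byte shadow area, so at entry argument n (n >= 5) is at
+// 8*n(%rsp) (return address + shadow area + preceding stack arguments); after
+// the PROLOGUE subtracts STACKSIZE this becomes STACKSIZE + 8*n, as encoded by
+// ARG5..ARG19 above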
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
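+//
+// one iteration of the loops below, as a C-like sketch (a0/a1 = the two A
+// panels at r11 and r11+r12, b0/b1 = the two B panels at r13 and r13+r14,
+// column l, each a0/a1 term a vector of 4 doubles):
+//   for(j=0; j<4; j++) ymm[j]   += a0[4*l + 0:4] * b0[4*l + j];   // 4x4 block (0,0)
+//   for(j=0; j<4; j++) ymm[4+j] += a1[4*l + 0:4] * b0[4*l + j];   // 4x4 block (1,0)
+//   for(j=0; j<4; j++) ymm[8+j] += a1[4*l + 0:4] * b1[4*l + j];   // 4x4 block (1,1)
+// i.e. only the lower three 4x4 blocks of the 8x8 nt product are accumulated;
+// the main loop unrolls this 4 times, the tail loop handles the remaining k%4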
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_8x8_lib4, @function
+inner_kernel_dgemm_add_nt_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_8x8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_8x8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm12
+ vmovapd 0(%r11, %r12, 1), %ymm13
+ vbroadcastsd 0(%r13), %ymm14
+ vbroadcastsd 0(%r13, %r14, 1), %ymm15
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ subl $4, %r10d
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 8(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 8(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 16(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 16(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 24(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 24(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 32(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 32(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 32(%r11, %r12, 1), %ymm13
+ vbroadcastsd 32(%r13, %r14, 1), %ymm15
+
+ // unroll 1
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 40(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 40(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 48(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 48(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 56(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 56(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 64(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 64(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 64(%r11, %r12, 1), %ymm13
+ vbroadcastsd 64(%r13, %r14, 1), %ymm15
+
+ // unroll 2
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 72(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 72(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 80(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 80(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 88(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 88(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 96(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 96(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 96(%r11, %r12, 1), %ymm13
+ vbroadcastsd 96(%r13, %r14, 1), %ymm15
+
+ // unroll 3
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 104(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 104(%r13, %r14, 1), %ymm15
+ addq $128, %r11
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 112(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 112(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 120(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 120(%r13, %r14, 1), %ymm15
+ addq $128, %r13
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 0(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 0(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 0(%r11, %r12, 1), %ymm13
+ vbroadcastsd 0(%r13, %r14, 1), %ymm15
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ subl $4, %r10d
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 8(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 8(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 16(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 16(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 24(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 24(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 32(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 32(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 32(%r11, %r12, 1), %ymm13
+ vbroadcastsd 32(%r13, %r14, 1), %ymm15
+
+ // unroll 1
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 40(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 40(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 48(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 48(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 56(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 56(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 64(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 64(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 64(%r11, %r12, 1), %ymm13
+ vbroadcastsd 64(%r13, %r14, 1), %ymm15
+
+ // unroll 2
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 72(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 72(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 80(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 80(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 88(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 88(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 96(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 96(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 96(%r11, %r12, 1), %ymm13
+ vbroadcastsd 96(%r13, %r14, 1), %ymm15
+
+ // unroll 3
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 104(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 104(%r13, %r14, 1), %ymm15
+ addq $128, %r11
+
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 112(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 112(%r13, %r14, 1), %ymm15
+
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 120(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 120(%r13, %r14, 1), %ymm15
+ addq $128, %r13
+
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+// vmovapd 0(%r11), %ymm12
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+// vbroadcastsd 0(%r13), %ymm14
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+// vmovapd 0(%r11, %r12, 1), %ymm13
+// vbroadcastsd 0(%r13, %r14, 1), %ymm15
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm12
+ vmovapd 0(%r11, %r12, 1), %ymm13
+ vbroadcastsd 0(%r13), %ymm14
+ vfmadd231pd %ymm12, %ymm14, %ymm0
+ vfmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 0(%r13, %r14, 1), %ymm15
+ vfmadd231pd %ymm13, %ymm15, %ymm8
+ subl $1, %r10d
+
+ vbroadcastsd 8(%r13), %ymm14
+ vfmadd231pd %ymm12, %ymm14, %ymm1
+ vfmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 8(%r13, %r14, 1), %ymm15
+ vfmadd231pd %ymm13, %ymm15, %ymm9
+ addq $32, %r11
+
+ vbroadcastsd 16(%r13), %ymm14
+ vfmadd231pd %ymm12, %ymm14, %ymm2
+ vfmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 16(%r13, %r14, 1), %ymm15
+ vfmadd231pd %ymm13, %ymm15, %ymm10
+ addq $32, %r13
+
+ vbroadcastsd -8(%r13), %ymm14
+ vfmadd231pd %ymm12, %ymm14, %ymm3
+ vfmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd -8(%r13, %r14, 1), %ymm15
+ vfmadd231pd %ymm13, %ymm15, %ymm11
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_8x8_lib4, .-inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_8x8_lib4, @function
+inner_kernel_dgemm_sub_nt_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_8x8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_8x8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovapd 0(%r11), %ymm12
+ vmovapd 0(%r11, %r12, 1), %ymm13
+ vbroadcastsd 0(%r13), %ymm14
+ vbroadcastsd 0(%r13, %r14, 1), %ymm15
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ subl $4, %r10d
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 8(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 8(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 16(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 16(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 24(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 24(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 32(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 32(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 32(%r11, %r12, 1), %ymm13
+ vbroadcastsd 32(%r13, %r14, 1), %ymm15
+
+ // unroll 1
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 40(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 40(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 48(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 48(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 56(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 56(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 64(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 64(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 64(%r11, %r12, 1), %ymm13
+ vbroadcastsd 64(%r13, %r14, 1), %ymm15
+
+ // unroll 2
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 72(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 72(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 80(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 80(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 88(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 88(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 96(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 96(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 96(%r11, %r12, 1), %ymm13
+ vbroadcastsd 96(%r13, %r14, 1), %ymm15
+
+ // unroll 3
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 104(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 104(%r13, %r14, 1), %ymm15
+ addq $128, %r11
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 112(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 112(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 120(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 120(%r13, %r14, 1), %ymm15
+ addq $128, %r13
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 0(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 0(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 0(%r11, %r12, 1), %ymm13
+ vbroadcastsd 0(%r13, %r14, 1), %ymm15
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+ // unroll 0
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ subl $4, %r10d
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 8(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 8(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 16(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 16(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 24(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 24(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 32(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 32(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 32(%r11, %r12, 1), %ymm13
+ vbroadcastsd 32(%r13, %r14, 1), %ymm15
+
+ // unroll 1
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 40(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 40(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 48(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 48(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 56(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 56(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 64(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 64(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 64(%r11, %r12, 1), %ymm13
+ vbroadcastsd 64(%r13, %r14, 1), %ymm15
+
+ // unroll 2
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 72(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 72(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 80(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 80(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 88(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 88(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vmovapd 96(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd 96(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+ vmovapd 96(%r11, %r12, 1), %ymm13
+ vbroadcastsd 96(%r13, %r14, 1), %ymm15
+
+ // unroll 3
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 104(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ vbroadcastsd 104(%r13, %r14, 1), %ymm15
+ addq $128, %r11
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 112(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ vbroadcastsd 112(%r13, %r14, 1), %ymm15
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 120(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ vbroadcastsd 120(%r13, %r14, 1), %ymm15
+ addq $128, %r13
+
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+// vmovapd 0(%r11), %ymm12
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+// vbroadcastsd 0(%r13), %ymm14
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+// vmovapd 0(%r11, %r12, 1), %ymm13
+// vbroadcastsd 0(%r13, %r14, 1), %ymm15
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm12
+ vmovapd 0(%r11, %r12, 1), %ymm13
+ vbroadcastsd 0(%r13), %ymm14
+ vfnmadd231pd %ymm12, %ymm14, %ymm0
+ vfnmadd231pd %ymm13, %ymm14, %ymm4
+ vbroadcastsd 0(%r13, %r14, 1), %ymm15
+ vfnmadd231pd %ymm13, %ymm15, %ymm8
+ subl $1, %r10d
+
+ vbroadcastsd 8(%r13), %ymm14
+ vfnmadd231pd %ymm12, %ymm14, %ymm1
+ vfnmadd231pd %ymm13, %ymm14, %ymm5
+ vbroadcastsd 8(%r13, %r14, 1), %ymm15
+ vfnmadd231pd %ymm13, %ymm15, %ymm9
+ addq $32, %r11
+
+ vbroadcastsd 16(%r13), %ymm14
+ vfnmadd231pd %ymm12, %ymm14, %ymm2
+ vfnmadd231pd %ymm13, %ymm14, %ymm6
+ vbroadcastsd 16(%r13, %r14, 1), %ymm15
+ vfnmadd231pd %ymm13, %ymm15, %ymm10
+ addq $32, %r13
+
+ vbroadcastsd -8(%r13), %ymm14
+ vfnmadd231pd %ymm12, %ymm14, %ymm3
+ vfnmadd231pd %ymm13, %ymm14, %ymm7
+ vbroadcastsd -8(%r13, %r14, 1), %ymm15
+ vfnmadd231pd %ymm13, %ymm15, %ymm11
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_8x8_lib4, .-inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
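+//
+// in short: acc := alpha*acc, then acc += beta*C unless beta == 0.0; the
+// twelve accumulators cover the 4x4 blocks (0,0), (1,0) and (1,1) of the 8x8
+// result, and C is read with the same layout (block (1,1) at byte offset 128
+// into the second row panel of C)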
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x8_lib4, @function
+inner_scale_ab_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x8_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_lib4:
+#endif
+#endif
+
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vmulpd %ymm8, %ymm15, %ymm8
+ vmulpd %ymm9, %ymm15, %ymm9
+ vmulpd %ymm10, %ymm15, %ymm10
+ vmulpd %ymm11, %ymm15, %ymm11
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 128(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 160(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 192(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 224(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x8_lib4, .-inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- &alpha
+// r11 <- &beta
+// r12 <- C
+// r13 <- 4*sdc*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
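+//
+// same scaling as above, but each group of four accumulators (0-3, 4-7, 8-11)
+// is first transposed as a 4x4 block in registers (vunpcklpd/vunpckhpd
+// followed by vperm2f128), and C is read with the transposed layout: blocks
+// (0,0) and (0,1) from the first row panel of C, block (1,1) from the second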
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_8X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_8x8_lib4, @function
+inner_tran_scale_ab_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_8x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_ab_8x8_lib4:
+#endif
+#endif
+
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm1, %ymm0, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm3, %ymm2, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm3
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+ vmulpd %ymm2, %ymm15, %ymm2
+ vmulpd %ymm3, %ymm15, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm5, %ymm4, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm7, %ymm6, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm7
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm4, %ymm15, %ymm4
+ vmulpd %ymm5, %ymm15, %ymm5
+ vmulpd %ymm6, %ymm15, %ymm6
+ vmulpd %ymm7, %ymm15, %ymm7
+
+ vunpcklpd %ymm9, %ymm8, %ymm12
+ vunpckhpd %ymm9, %ymm8, %ymm13
+ vunpcklpd %ymm11, %ymm10, %ymm14
+ vunpckhpd %ymm11, %ymm10, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm8
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm10
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm9
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm11
+
+ vbroadcastsd 0(%r10), %ymm15 // alpha
+
+ vmulpd %ymm8, %ymm15, %ymm8
+ vmulpd %ymm9, %ymm15, %ymm9
+ vmulpd %ymm10, %ymm15, %ymm10
+ vmulpd %ymm11, %ymm15, %ymm11
+
+ vbroadcastsd 0(%r11), %ymm14 // beta
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomisd %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovapd 0(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 128(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r12), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 128(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 160(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 192(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 224(%r12, %r13, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_8x8_lib4, .-inner_tran_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
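+//
+// alpha = beta = 1.0 special case of the scaling above: acc += C, with the
+// 1.0 loaded from the LC04 constant so the same fmadd path can be used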
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x8_lib4, @function
+inner_scale_11_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_scale_11_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x8_lib4; .scl 2; .type 32; .endef
+inner_scale_11_8x8_lib4:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14 // beta=1.0
+#else
+ vmovapd LC04(%rip), %ymm14 // beta=1.0
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 0(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 32(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 64(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 96(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 128(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 160(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 192(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 224(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x8_lib4, .-inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm8 <- [d80 d91 da2 db3]
+// ymm9 <- [d81 d90 da3 db2]
+// ymm10 <- [d83 d92 da1 db0]
+// ymm11 <- [d82 d93 da0 db1]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_11_8X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_11_8x8_lib4, @function
+inner_tran_scale_11_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_tran_scale_11_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_11_8x8_lib4; .scl 2; .type 32; .endef
+inner_tran_scale_11_8x8_lib4:
+#endif
+#endif
+
+
+ vunpcklpd %ymm1, %ymm0, %ymm12
+ vunpckhpd %ymm1, %ymm0, %ymm13
+ vunpcklpd %ymm3, %ymm2, %ymm14
+ vunpckhpd %ymm3, %ymm2, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm0
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm2
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm1
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm3
+
+ vunpcklpd %ymm5, %ymm4, %ymm12
+ vunpckhpd %ymm5, %ymm4, %ymm13
+ vunpcklpd %ymm7, %ymm6, %ymm14
+ vunpckhpd %ymm7, %ymm6, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm4
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm6
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm5
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm7
+
+ vunpcklpd %ymm9, %ymm8, %ymm12
+ vunpckhpd %ymm9, %ymm8, %ymm13
+ vunpcklpd %ymm11, %ymm10, %ymm14
+ vunpckhpd %ymm11, %ymm10, %ymm15
+
+ vperm2f128 $0x20, %ymm14, %ymm12, %ymm8
+ vperm2f128 $0x31, %ymm14, %ymm12, %ymm10
+ vperm2f128 $0x20, %ymm15, %ymm13, %ymm9
+ vperm2f128 $0x31, %ymm15, %ymm13, %ymm11
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovapd .LC04(%rip), %ymm14 // beta=1.0
+#else
+ vmovapd LC04(%rip), %ymm14 // beta=1.0
+#endif
+
+ vmovapd 0(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm0
+ vmovapd 32(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm1
+ vmovapd 64(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm2
+ vmovapd 96(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm3
+
+ vmovapd 128(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm4
+ vmovapd 160(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm5
+ vmovapd 192(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm6
+ vmovapd 224(%r10), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm7
+
+ vmovapd 128(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm8
+ vmovapd 160(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm9
+ vmovapd 192(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm10
+ vmovapd 224(%r10, %r11, 1), %ymm15
+ vfmadd231pd %ymm14, %ymm15, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_11_8x8_lib4, .-inner_tran_scale_11_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
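+//
+// one factorization step, as a C-like sketch (column j of the accumulators):
+//   d = acc[j][j];
+//   inv_d = (d > 0.0) ? 1.0/sqrt(d) : 0.0;   // non-positive pivot -> zero column
+//   inv_diag_E[j] = inv_d;
+//   acc[:, j] *= inv_d;
+//   acc[:, i] -= acc[:, j] * acc[i][j]   for every i > j;
+// columns of the right 8x4 block beyond kn (r11d) are left unfactorized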
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_8x8_vs_lib4, @function
+inner_edge_dpotrf_8x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_8x8_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_8x8_vs_lib4:
+#endif
+#endif
+
+ vxorpd %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ vmovsd %xmm13, 0(%r10)
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vpermpd $0x55, %ymm0, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vperm2f128 $0x00, %ymm4, %ymm4, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm8
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm9
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm11
+
+ vpermilpd $0x3, %xmm1, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ vmovsd %xmm13, 8(%r10)
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vperm2f128 $0x00, %ymm5, %ymm5, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm8
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm9
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm11
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ vmovsd %xmm13, 16(%r10)
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vpermpd $0xff, %ymm2, %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vperm2f128 $0x00, %ymm6, %ymm6, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm8
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm9
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm11
+
+ vpermpd $0xff, %ymm3, %ymm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vperm2f128 $0x00, %ymm7, %ymm7, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm8
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm9
+ vperm2f128 $0x11, %ymm7, %ymm7, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm11
+
+ vmovsd %xmm8, %xmm8, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 9f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+10:
+ vmovsd %xmm13, 32(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm8, %ymm13, %ymm8
+ cmpl $6, %r11d
+ jl 0f // ret
+// vperm2f128 $0x00, %ymm8, %ymm8, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0x55, %ymm8, %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vperm2f128 $0x11, %ymm8, %ymm8, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vpermilpd $0x3, %xmm9, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 11f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+12:
+ vmovsd %xmm13, 40(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm9, %ymm13, %ymm9
+ cmpl $7, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm9, %ymm9, %ymm12
+ vpermilpd $0x0, %ymm12, %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vpermilpd $0xf, %ymm12, %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vextractf128 $0x1, %ymm10, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 13f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+14:
+ vmovsd %xmm13, 48(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm10, %ymm13, %ymm10
+ cmpl $8, %r11d
+ jl 0f // ret
+// vperm2f128 $0x11, %ymm10, %ymm10, %ymm12
+// vpermilpd $0xf, %ymm12, %ymm13
+ vpermpd $0xff, %ymm10, %ymm13
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+// vextractf128 $0x1, %ymm11, %xmm13
+// vpermilpd $0x3, %xmm13, %xmm13
+ vpermpd $0xff, %ymm11, %ymm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 15f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+16:
+ vmovsd %xmm13, 56(%r10)
+// vmovddup %xmm13, %xmm13
+// vperm2f128 $0x00, %ymm13, %ymm13, %ymm13
+ vpermpd $0x00, %ymm13, %ymm13
+ vmulpd %ymm11, %ymm13, %ymm11
+
+
+
+ jmp 0f
+
+1:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+9:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 10b
+
+11:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 12b
+
+13:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 14b
+
+15:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 16b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_8x8_vs_lib4, .-inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- 4*sde*sizeof(double)
+// r12 <- inv_diag_E
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
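+//
+// forward substitution with the lower triangular 8x8 factor E, as a C-like
+// sketch (rows 4..7 of E live in the panel at E + 4*sde*sizeof(double)):
+//   for(j=0; j<8; j++) {
+//       acc[:, j] *= inv_diag_E[j];
+//       acc[:, i] -= acc[:, j] * E[i][j]   for every i > j;
+//       }
+// i.e. acc is overwritten with acc * E^-T, using the precomputed reciprocals
+// of the diagonal instead of divisions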
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X8L_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x8l_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x8l_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x8l_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x8l_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x8l_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm11
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm11
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm11
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm11
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm8, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm9, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm10, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm11, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x8l_lib4, .-inner_edge_dtrsm_rlt_inv_8x8l_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- sde
+// r12 <- inv_diag_E
+// r13 <- D
+// r14 <- sdd
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- sde
+// r12 <- inv_diag_E
+// r13 <- D
+// r14 <- sdd
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
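+// Note (sketch, inferred from the code below): the same column sweep as the
+// 8x8l variant above, arranged for the "upper" 8x8 layout: while columns 0-3
+// (ymm0-ymm3) are eliminated, the already-stored panel of D at r13 (+ r14) is
+// reloaded into ymm12 so that the trailing 4x4 block in ymm8-ymm11 is updated
+// as well; columns 4-7 are then solved in ymm4-ymm7 / ymm8-ymm11.
+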
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X8U_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x8u_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x8u_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x8u_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x8u_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x8u_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+
+ vmovapd 0(%r13, %r14, 1), %ymm12
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+
+ vmovapd 32(%r13, %r14, 1), %ymm12
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+
+ vmovapd 64(%r13, %r14, 1), %ymm12
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+ vmovapd 96(%r13, %r14, 1), %ymm12
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm4, %ymm13, %ymm4
+ vmulpd %ymm8, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm5, %ymm13, %ymm5
+ vmulpd %ymm9, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm6, %ymm13, %ymm6
+ vmulpd %ymm10, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm7, %ymm13, %ymm7
+ vmulpd %ymm11, %ymm13, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x8u_lib4, .-inner_edge_dtrsm_rlt_inv_8x8u_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- sde
+// r12 <- inv_diag_E
+// r13d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- sde
+// r12 <- inv_diag_E
+// r13d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
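+// Note (inferred from the code below): the same solve as the 8x8l variant
+// above, except that the eliminations following column 4+jj are skipped as
+// soon as kn (r13d) columns have been produced (the cmpl $6/$7/$8 checks).
+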
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X8L_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vmulpd %ymm4, %ymm13, %ymm4
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm11
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vmulpd %ymm5, %ymm13, %ymm5
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm11
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vmulpd %ymm6, %ymm13, %ymm6
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm11
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+ vmulpd %ymm7, %ymm13, %ymm7
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm7, %ymm13, %ymm11
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm8, %ymm13, %ymm8
+ cmpl $6, %r13d
+ jl 0f // ret
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm9, %ymm13, %ymm9
+ cmpl $7, %r13d
+ jl 0f // ret
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm10, %ymm13, %ymm10
+ cmpl $8, %r13d
+ jl 0f // ret
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm11, %ymm13, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- sde
+// r12 <- inv_diag_E
+// r13 <- D
+// r14 <- sdd
+// r15d <- kn
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- sde
+// r12 <- inv_diag_E
+// r13 <- D
+// r14 <- sdd
+// r15d <- kn
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d51 d62 d73]
+// ymm5 <- [d41 d50 d63 d72]
+// ymm6 <- [d43 d52 d61 d70]
+// ymm7 <- [d42 d53 d60 d71]
+// ymm12 <- dirty
+// ymm13 <- dirty
+
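+// Note (inferred from the code below): as in the 8x8l_vs variant above, the
+// trailing columns are skipped according to kn, which is kept in r15d here.
+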
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_8X8U_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4:
+#endif
+#endif
+
+ vbroadcastsd 0(%r12), %ymm13
+ vmulpd %ymm0, %ymm13, %ymm0
+ vbroadcastsd 8(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm1
+ vbroadcastsd 16(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm2
+ vbroadcastsd 24(%r10), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm3
+
+ vmovapd 0(%r13, %r14, 1), %ymm12
+ vbroadcastsd 0(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm0, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+
+ vbroadcastsd 8(%r12), %ymm13
+ vmulpd %ymm1, %ymm13, %ymm1
+ vbroadcastsd 48(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm2
+ vbroadcastsd 56(%r10), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm3
+
+ vmovapd 32(%r13, %r14, 1), %ymm12
+ vbroadcastsd 32(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 40(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm1, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+
+ vbroadcastsd 16(%r12), %ymm13
+ vmulpd %ymm2, %ymm13, %ymm2
+ vbroadcastsd 88(%r10), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm3
+
+ vmovapd 64(%r13, %r14, 1), %ymm12
+ vbroadcastsd 64(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 72(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 80(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm2, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+
+ vbroadcastsd 24(%r12), %ymm13
+ vmulpd %ymm3, %ymm13, %ymm3
+
+ vmovapd 96(%r13, %r14, 1), %ymm12
+ vbroadcastsd 96(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm4
+ vfnmadd231pd %ymm12, %ymm13, %ymm8
+ vbroadcastsd 104(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm5
+ vfnmadd231pd %ymm12, %ymm13, %ymm9
+ vbroadcastsd 112(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm6
+ vfnmadd231pd %ymm12, %ymm13, %ymm10
+ vbroadcastsd 120(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm3, %ymm13, %ymm7
+ vfnmadd231pd %ymm12, %ymm13, %ymm11
+
+ addq $128, %r10
+
+ vbroadcastsd 32(%r12), %ymm13
+ vmulpd %ymm4, %ymm13, %ymm4
+ vmulpd %ymm8, %ymm13, %ymm8
+ cmpl $6, %r15d
+ jl 0f // ret
+ vbroadcastsd 8(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm5
+ vfnmadd231pd %ymm8, %ymm13, %ymm9
+ vbroadcastsd 16(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm6
+ vfnmadd231pd %ymm8, %ymm13, %ymm10
+ vbroadcastsd 24(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm4, %ymm13, %ymm7
+ vfnmadd231pd %ymm8, %ymm13, %ymm11
+
+ vbroadcastsd 40(%r12), %ymm13
+ vmulpd %ymm5, %ymm13, %ymm5
+ vmulpd %ymm9, %ymm13, %ymm9
+ cmpl $7, %r15d
+ jl 0f // ret
+ vbroadcastsd 48(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm6
+ vfnmadd231pd %ymm9, %ymm13, %ymm10
+ vbroadcastsd 56(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm5, %ymm13, %ymm7
+ vfnmadd231pd %ymm9, %ymm13, %ymm11
+
+ vbroadcastsd 48(%r12), %ymm13
+ vmulpd %ymm6, %ymm13, %ymm6
+ vmulpd %ymm10, %ymm13, %ymm10
+ cmpl $8, %r15d
+ jl 0f // ret
+ vbroadcastsd 88(%r10, %r11, 1), %ymm13
+ vfnmadd231pd %ymm6, %ymm13, %ymm7
+ vfnmadd231pd %ymm10, %ymm13, %ymm11
+
+ vbroadcastsd 56(%r12), %ymm13
+ vmulpd %ymm7, %ymm13, %ymm7
+ vmulpd %ymm11, %ymm13, %ymm11
+
+
+
+// subq $128, %r10
+// vmovapd 0(%r10, %r11, 1), %ymm4
+// vmovapd 32(%r10, %r11, 1), %ymm5
+// vmovapd 64(%r10, %r11, 1), %ymm6
+// vmovapd 96(%r10, %r11, 1), %ymm7
+
+
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4, .-inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
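+// Note (inferred from the store offsets below): writes the first four columns
+// of both row panels (ymm0-ymm3 at D, ymm4-ymm7 at D + 4*sdd*sizeof(double))
+// and columns 4-7 of the second row panel only (ymm8-ymm11 at byte offset 128
+// of that panel); the upper-right 4x4 block of the 8x8 tile is left untouched.
+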
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8L_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8l_lib4, @function
+inner_store_8x8l_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8l_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8l_lib4; .scl 2; .type 32; .endef
+inner_store_8x8l_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+
+ vmovapd %ymm8, 128(%r10, %r11, 1)
+ vmovapd %ymm9, 160(%r10, %r11, 1)
+ vmovapd %ymm10, 192(%r10, %r11, 1)
+ vmovapd %ymm11, 224(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8l_lib4, .-inner_store_8x8l_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
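+// Note (inferred from the store offsets below): writes all eight columns of
+// the first row panel (ymm0-ymm7) and only columns 4-7 of the second row
+// panel (ymm8-ymm11 at byte offset 128 of D + 4*sdd*sizeof(double)); the
+// lower-left 4x4 block of the 8x8 tile is left untouched.
+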
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8U_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8u_lib4, @function
+inner_store_8x8u_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8u_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8u_lib4; .scl 2; .type 32; .endef
+inner_store_8x8u_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 128(%r10)
+ vmovapd %ymm5, 160(%r10)
+ vmovapd %ymm6, 192(%r10)
+ vmovapd %ymm7, 224(%r10)
+
+ vmovapd %ymm8, 128(%r10, %r11, 1)
+ vmovapd %ymm9, 160(%r10, %r11, 1)
+ vmovapd %ymm10, 192(%r10, %r11, 1)
+ vmovapd %ymm11, 224(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8u_lib4, .-inner_store_8x8u_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
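+// Note (sketch, inferred from the code below): km (r12d) is converted to
+// double, broadcast, and subtracted from the constant vector .LC03
+// (presumably the row indices of the second panel), so that the sign bits of
+// the result form the vmaskmovpd store mask for rows 4-7; kn (r13d) then cuts
+// off the trailing columns.
+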
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8L_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8l_vs_lib4, @function
+inner_store_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8l_vs_lib4; .scl 2; .type 32; .endef
+inner_store_8x8l_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+ vmaskmovpd %ymm4, %ymm15, 0(%r10, %r11, 1)
+ vmaskmovpd %ymm5, %ymm15, 32(%r10, %r11, 1)
+ vmaskmovpd %ymm6, %ymm15, 64(%r10, %r11, 1)
+ vmaskmovpd %ymm7, %ymm15, 96(%r10, %r11, 1)
+
+ vmaskmovpd %ymm8, %ymm15, 128(%r10, %r11, 1)
+ cmpl $6, %r13d
+ jl 0f // end
+ vmaskmovpd %ymm9, %ymm15, 160(%r10, %r11, 1)
+ cmpl $7, %r13d
+ jl 0f // end
+ vmaskmovpd %ymm10, %ymm15, 192(%r10, %r11, 1)
+ je 0f // end
+ vmaskmovpd %ymm11, %ymm15, 224(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8l_vs_lib4, .-inner_store_8x8l_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8U_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8u_vs_lib4, @function
+inner_store_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8u_vs_lib4; .scl 2; .type 32; .endef
+inner_store_8x8u_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd %ymm1, 32(%r10)
+ vmovapd %ymm2, 64(%r10)
+ vmovapd %ymm3, 96(%r10)
+
+
+ vmovapd %ymm4, 128(%r10)
+ vmaskmovpd %ymm8, %ymm15, 128(%r10, %r11, 1)
+ cmpl $6, %r13d
+ jl 0f // end
+ vmovapd %ymm5, 160(%r10)
+ vmaskmovpd %ymm9, %ymm15, 160(%r10, %r11, 1)
+ cmpl $7, %r13d
+ jl 0f // end
+ vmovapd %ymm6, 192(%r10)
+ vmaskmovpd %ymm10, %ymm15, 192(%r10, %r11, 1)
+ je 0f // end
+ vmovapd %ymm7, 224(%r10)
+ vmaskmovpd %ymm11, %ymm15, 224(%r10, %r11, 1)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8u_vs_lib4, .-inner_store_8x8u_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
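+// Note (inferred from the blends below): columns 1-3 of the first row panel
+// and columns 5-7 of the second one are merged with the values already in
+// memory (vblendpd masks 0x1/0x3/0x7 keep the existing leading elements), so
+// only the lower-triangular part of the 8x8 tile is overwritten.
+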
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_lib4, @function
+inner_store_l_8x8_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x8_lib4:
+#endif
+#endif
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+ vmovapd %ymm4, 0(%r10, %r11, 1)
+ vmovapd %ymm5, 32(%r10, %r11, 1)
+ vmovapd %ymm6, 64(%r10, %r11, 1)
+ vmovapd %ymm7, 96(%r10, %r11, 1)
+
+ vmovapd %ymm8, 128(%r10, %r11, 1)
+ vmovapd 160(%r10, %r11, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm9, %ymm9
+ vmovapd %ymm9, 160(%r10, %r11, 1)
+ vmovapd 192(%r10, %r11, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm10, %ymm10
+ vmovapd %ymm10, 192(%r10, %r11, 1)
+ vmovapd 224(%r10, %r11, 1), %ymm14
+ vblendpd $0x7, %ymm14, %ymm11, %ymm11
+ vmovapd %ymm11, 224(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x8_lib4, .-inner_store_l_8x8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12d <- km
+// r13d <- kn
+// r14 <- dirty
+// r15 <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+// ymm8 <- [d80 d90 da0 db0]
+// ymm9 <- [d81 d91 da1 db1]
+// ymm10 <- [d82 d92 da2 db2]
+// ymm11 <- [d83 d93 da3 db3]
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_vs_lib4, @function
+inner_store_l_8x8_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_8x8_vs_lib4:
+#endif
+#endif
+
+ vcvtsi2sd %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovupd LC03(%rip), %ymm14
+#endif
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm15, %ymm14, %ymm15
+
+ vmovapd %ymm0, 0(%r10)
+ vmovapd 32(%r10), %ymm14
+ vblendpd $0x1, %ymm14, %ymm1, %ymm1
+ vmovapd %ymm1, 32(%r10)
+ vmovapd 64(%r10), %ymm14
+ vblendpd $0x3, %ymm14, %ymm2, %ymm2
+ vmovapd %ymm2, 64(%r10)
+ vmovapd 96(%r10), %ymm14
+ vblendpd $0x7, %ymm14, %ymm3, %ymm3
+ vmovapd %ymm3, 96(%r10)
+
+ vmaskmovpd %ymm4, %ymm15, 0(%r10, %r11, 1)
+ vmaskmovpd %ymm5, %ymm15, 32(%r10, %r11, 1)
+ vmaskmovpd %ymm6, %ymm15, 64(%r10, %r11, 1)
+ vmaskmovpd %ymm7, %ymm15, 96(%r10, %r11, 1)
+
+ vmaskmovpd %ymm8, %ymm15, 128(%r10, %r11, 1)
+ cmpl $6, %r13d
+ jl 0f // end
+ vmovapd 160(%r10, %r11, 1), %ymm14
+ vblendpd $0x1, %ymm14, %ymm9, %ymm9
+ vmaskmovpd %ymm9, %ymm15, 160(%r10, %r11, 1)
+ cmpl $7, %r13d
+ jl 0f // end
+ vmovapd 192(%r10, %r11, 1), %ymm14
+ vblendpd $0x3, %ymm14, %ymm10, %ymm10
+ vmaskmovpd %ymm10, %ymm15, 192(%r10, %r11, 1)
+ je 0f // end
+ vmovapd 224(%r10, %r11, 1), %ymm14
+ vblendpd $0x7, %ymm14, %ymm11, %ymm11
+ vmaskmovpd %ymm11, %ymm15, 224(%r10, %r11, 1)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x8_vs_lib4, .-inner_store_l_8x8_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// rbp <- dirty
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm4 <- [d40 d50 d60 d70]
+// ymm5 <- [d41 d51 d61 d71]
+// ymm6 <- [d42 d52 d62 d72]
+// ymm7 <- [d43 d53 d63 d73]
+
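+// Note (sketch, inferred from the code below): the generalized store builds
+// row masks from m0/m1 via the .LC02/.LC03 index constants, shifts the
+// register columns and advances D by 32 bytes for each column skipped by n0,
+// and then stores with masked moves; for offset 1/2/3 each column is rotated
+// across the two row panels (vperm2f128 + vshufpd) and the row masks are
+// split with the .LC05-.LC10 constants, so the tile may straddle three 4-row
+// panels of D.
+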
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_gen_lib4, @function
+inner_store_8x8_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_8x8_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_gen_lib4; .scl 2; .type 32; .endef
+inner_store_8x8_gen_lib4:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2sd %r13d, %xmm14, %xmm14
+ vcvtsi2sd %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm12
+ vmovupd .LC03(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm12
+ vmovupd LC03(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vmovddup %xmm15, %xmm15
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $1, %xmm15, %ymm15, %ymm15
+ vsubpd %ymm12, %ymm14, %ymm14
+ vsubpd %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm3, %ymm2
+ vmovapd %ymm7, %ymm6
+ vmovapd %ymm8, %ymm7
+ vmovapd %ymm9, %ymm8
+ vmovapd %ymm10, %ymm9
+ vmovapd %ymm11, %ymm10
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm2, %ymm1
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm7, %ymm6
+ vmovapd %ymm8, %ymm7
+ vmovapd %ymm9, %ymm8
+ vmovapd %ymm10, %ymm9
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %ymm1, %ymm0
+ vmovapd %ymm5, %ymm4
+ vmovapd %ymm6, %ymm5
+ vmovapd %ymm7, %ymm6
+ vmovapd %ymm8, %ymm7
+ vmovapd %ymm9, %ymm8
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmaskmovpd %ymm0, %ymm14, 0(%r11)
+ vmaskmovpd %ymm1, %ymm14, 32(%r11)
+ vmaskmovpd %ymm2, %ymm14, 64(%r11)
+ vmaskmovpd %ymm3, %ymm14, 96(%r11)
+
+ vmaskmovpd %ymm4, %ymm15, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm5, %ymm15, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm6, %ymm15, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm7, %ymm15, 96(%r11, %r12, 1)
+
+ vmaskmovpd %ymm8, %ymm15, 128(%r11, %r12, 1)
+ cmpl $6, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm9, %ymm15, 160(%r11, %r12, 1)
+ cmpl $7, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm10, %ymm15, 192(%r11, %r12, 1)
+ je 4f // end
+ vmaskmovpd %ymm11, %ymm15, 224(%r11, %r12, 1)
+
+ jmp 4f
+
+0:
+
+ cmpl $1, %r10d
+ jg 1f
+
+ // offset==1
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm12
+ vshufpd $0x5, %ymm0, %ymm12, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm12
+ vshufpd $0x5, %ymm4, %ymm12, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm12
+ vshufpd $0x5, %ymm1, %ymm12, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm12
+ vshufpd $0x5, %ymm5, %ymm12, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm12
+ vshufpd $0x5, %ymm2, %ymm12, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm12
+ vshufpd $0x5, %ymm6, %ymm12, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm12
+ vshufpd $0x5, %ymm3, %ymm12, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm12
+ vshufpd $0x5, %ymm7, %ymm12, %ymm7
+
+ vperm2f128 $0x01, %ymm8, %ymm8, %ymm12
+ vshufpd $0x5, %ymm8, %ymm12, %ymm8
+
+ vperm2f128 $0x01, %ymm9, %ymm9, %ymm12
+ vshufpd $0x5, %ymm9, %ymm12, %ymm9
+
+ vperm2f128 $0x01, %ymm10, %ymm10, %ymm12
+ vshufpd $0x5, %ymm10, %ymm12, %ymm10
+
+ vperm2f128 $0x01, %ymm11, %ymm11, %ymm12
+ vshufpd $0x5, %ymm11, %ymm12, %ymm11
+
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm15, %ymm12, %ymm15
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm14, %ymm12, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vandpd .LC08(%rip), %ymm14, %ymm12
+ vandpd .LC05(%rip), %ymm15, %ymm13
+#elif defined(OS_MAC)
+ vandpd LC08(%rip), %ymm14, %ymm12
+ vandpd LC05(%rip), %ymm15, %ymm13
+#endif
+
+ vblendpd $0x1, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vandpd .LC08(%rip), %ymm15, %ymm15
+#elif defined(OS_MAC)
+ vandpd LC08(%rip), %ymm15, %ymm15
+#endif
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ // offset==2
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x03, %ymm4, %ymm0, %ymm0
+ vperm2f128 $0x03, %ymm13, %ymm4, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x03, %ymm5, %ymm1, %ymm1
+ vperm2f128 $0x03, %ymm13, %ymm5, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x03, %ymm6, %ymm2, %ymm2
+ vperm2f128 $0x03, %ymm13, %ymm6, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x03, %ymm7, %ymm3, %ymm3
+ vperm2f128 $0x03, %ymm13, %ymm7, %ymm7
+
+ vperm2f128 $0x01, %ymm8, %ymm8, %ymm8
+
+ vperm2f128 $0x01, %ymm9, %ymm9, %ymm9
+
+ vperm2f128 $0x01, %ymm10, %ymm10, %ymm10
+
+ vperm2f128 $0x01, %ymm11, %ymm11, %ymm11
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vandpd .LC09(%rip), %ymm14, %ymm12
+ vandpd .LC06(%rip), %ymm15, %ymm13
+#elif defined(OS_MAC)
+ vandpd LC09(%rip), %ymm14, %ymm12
+ vandpd LC06(%rip), %ymm15, %ymm13
+#endif
+
+ vblendpd $0x3, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vandpd .LC09(%rip), %ymm15, %ymm15
+#elif defined(OS_MAC)
+ vandpd LC09(%rip), %ymm15, %ymm15
+#endif
+
+ jmp 3f
+
+2:
+
+ // offset==3
+
+ vmovapd %ymm0, %ymm13
+ vperm2f128 $0x21, %ymm0, %ymm4, %ymm12
+ vshufpd $0x5, %ymm12, %ymm4, %ymm0
+ vperm2f128 $0x21, %ymm4, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm4
+
+ vmovapd %ymm1, %ymm13
+ vperm2f128 $0x21, %ymm1, %ymm5, %ymm12
+ vshufpd $0x5, %ymm12, %ymm5, %ymm1
+ vperm2f128 $0x21, %ymm5, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm5
+
+ vmovapd %ymm2, %ymm13
+ vperm2f128 $0x21, %ymm2, %ymm6, %ymm12
+ vshufpd $0x5, %ymm12, %ymm6, %ymm2
+ vperm2f128 $0x21, %ymm6, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm6
+
+ vmovapd %ymm3, %ymm13
+ vperm2f128 $0x21, %ymm3, %ymm7, %ymm12
+ vshufpd $0x5, %ymm12, %ymm7, %ymm3
+ vperm2f128 $0x21, %ymm7, %ymm13, %ymm12
+ vshufpd $0x5, %ymm12, %ymm13, %ymm7
+
+ vperm2f128 $0x01, %ymm8, %ymm8, %ymm12
+ vshufpd $0x5, %ymm12, %ymm8, %ymm8
+
+ vperm2f128 $0x01, %ymm9, %ymm9, %ymm12
+ vshufpd $0x5, %ymm12, %ymm9, %ymm9
+
+ vperm2f128 $0x01, %ymm10, %ymm10, %ymm12
+ vshufpd $0x5, %ymm12, %ymm10, %ymm10
+
+ vperm2f128 $0x01, %ymm11, %ymm11, %ymm12
+ vshufpd $0x5, %ymm12, %ymm11, %ymm11
+
+ vperm2f128 $0x01, %ymm14, %ymm14, %ymm12
+ vshufpd $0x5, %ymm12, %ymm14, %ymm14
+ vperm2f128 $0x01, %ymm15, %ymm15, %ymm12
+ vshufpd $0x5, %ymm12, %ymm15, %ymm15
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vandpd .LC10(%rip), %ymm14, %ymm12
+ vandpd .LC07(%rip), %ymm15, %ymm13
+#elif defined(OS_MAC)
+ vandpd LC10(%rip), %ymm14, %ymm12
+ vandpd LC07(%rip), %ymm15, %ymm13
+#endif
+
+ vblendpd $0x7, %ymm14, %ymm15, %ymm14
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vandpd .LC10(%rip), %ymm15, %ymm15
+#elif defined(OS_MAC)
+ vandpd LC10(%rip), %ymm15, %ymm15
+#endif
+
+3:
+
+ vmaskmovpd %ymm0, %ymm12, 0(%r11)
+ vmaskmovpd %ymm4, %ymm14, 0(%r11, %r12, 1)
+ vmaskmovpd %ymm0, %ymm13, 0(%r11, %r12, 2)
+ vmaskmovpd %ymm1, %ymm12, 32(%r11)
+ vmaskmovpd %ymm5, %ymm14, 32(%r11, %r12, 1)
+ vmaskmovpd %ymm1, %ymm13, 32(%r11, %r12, 2)
+ vmaskmovpd %ymm2, %ymm12, 64(%r11)
+ vmaskmovpd %ymm6, %ymm14, 64(%r11, %r12, 1)
+ vmaskmovpd %ymm2, %ymm13, 64(%r11, %r12, 2)
+ vmaskmovpd %ymm3, %ymm12, 96(%r11)
+ vmaskmovpd %ymm7, %ymm14, 96(%r11, %r12, 1)
+ vmaskmovpd %ymm3, %ymm13, 96(%r11, %r12, 2)
+
+ vmaskmovpd %ymm8, %ymm15, 128(%r11, %r12, 1)
+ vmaskmovpd %ymm8, %ymm13, 128(%r11, %r12, 2)
+ cmpl $6, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm9, %ymm15, 160(%r11, %r12, 1)
+ vmaskmovpd %ymm9, %ymm13, 160(%r11, %r12, 2)
+ cmpl $7, %r15d
+ jl 4f // end
+ vmaskmovpd %ymm10, %ymm15, 192(%r11, %r12, 1)
+ vmaskmovpd %ymm10, %ymm13, 192(%r11, %r12, 2)
+ je 4f // end
+ vmaskmovpd %ymm11, %ymm15, 224(%r11, %r12, 1)
+ vmaskmovpd %ymm11, %ymm13, 224(%r11, %r12, 2)
+
+4:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_gen_lib4, .-inner_store_8x8_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dgemm_nt_8x8l_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
+
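+// Note (sketch, inferred from the prototype above and the inner calls below):
+// an 8x8 'nt' block kernel, roughly D = beta*C + alpha * A * B^T on matrices
+// stored in 4-row panels (lib4); the 8x8l variant writes only the lower
+// 4x4-blocked part of the tile (see inner_store_8x8l_lib4). A hypothetical
+// call from C could look like:
+//
+//   double alpha = 1.0, beta = 0.0;  // example scaling factors
+//   kernel_dgemm_nt_8x8l_lib4(k, &alpha, A, sda, B, sdb, &beta, C, sdc, D, sdd);
+//
+// with sda/sdb/sdc/sdd the panel strides of the respective matrices.
+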
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x8l_lib4
+ .type kernel_dgemm_nt_8x8l_lib4, @function
+kernel_dgemm_nt_8x8l_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x8l_lib4
+_kernel_dgemm_nt_8x8l_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x8l_lib4
+ .def kernel_dgemm_nt_8x8l_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8l_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ movq ARG6, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8L_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8l_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8l_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x8l_lib4, .-kernel_dgemm_nt_8x8l_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dgemm_nt_8x8u_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
+
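+// Note (inferred from the code below): the same product as the 8x8l kernel
+// above, but A and B are swapped when calling the inner dgemm routine and the
+// result is re-transposed by inner_tran_scale_ab_8x8_lib4, so that the upper
+// 4x4-blocked part of the 8x8 tile is produced and stored
+// (inner_store_8x8u_lib4).
+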
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x8u_lib4
+ .type kernel_dgemm_nt_8x8u_lib4, @function
+kernel_dgemm_nt_8x8u_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x8u_lib4
+_kernel_dgemm_nt_8x8u_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x8u_lib4
+ .def kernel_dgemm_nt_8x8u_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8u_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG5, %r11 // B
+ movq ARG6, %r12 // sdb
+	sall $5, %r12d // 4*sdb*sizeof(double)
+	movq ARG3, %r13 // A
+	movq ARG4, %r14 // sda
+	sall $5, %r14d // 4*sda*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8U_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8u_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8u_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x8u_lib4, .-kernel_dgemm_nt_8x8u_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_dgemm_nt_8x8l_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
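+// Note (inferred from the code below): as kernel_dgemm_nt_8x8l_lib4, with the
+// extra km/kn arguments forwarded to the masked store so that partial tiles
+// at the matrix border are not written past m and n.
+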
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x8l_vs_lib4
+ .type kernel_dgemm_nt_8x8l_vs_lib4, @function
+kernel_dgemm_nt_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x8l_vs_lib4
+_kernel_dgemm_nt_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x8l_vs_lib4
+ .def kernel_dgemm_nt_8x8l_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8l_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ movq ARG6, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8l_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8l_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x8l_vs_lib4, .-kernel_dgemm_nt_8x8l_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_dgemm_nt_8x8u_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x8u_vs_lib4
+ .type kernel_dgemm_nt_8x8u_vs_lib4, @function
+kernel_dgemm_nt_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x8u_vs_lib4
+_kernel_dgemm_nt_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x8u_vs_lib4
+ .def kernel_dgemm_nt_8x8u_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8u_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG5, %r11 // B
+ movq ARG6, %r12 // sdb
+	sall $5, %r12d // 4*sdb*sizeof(double)
+	movq ARG3, %r13 // A
+	movq ARG4, %r14 // sda
+	sall $5, %r14d // 4*sda*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG12, %r12 // km
+ movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8u_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8u_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_8x8u_vs_lib4, .-kernel_dgemm_nt_8x8u_vs_lib4
+#endif
+
+
+
+
+
+#if 0
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
+// void kernel_dgemm_nt_8x8_gen_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_8x8_gen_lib4
+ .type kernel_dgemm_nt_8x8_gen_lib4, @function
+kernel_dgemm_nt_8x8_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_8x8_gen_lib4
+_kernel_dgemm_nt_8x8_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_8x8_gen_lib4
+ .def kernel_dgemm_nt_8x8_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_8x8_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ movq ARG6, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size kernel_dgemm_nt_8x8_gen_lib4, .-kernel_dgemm_nt_8x8_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dsyrk_nt_l_8x8_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd);
+
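+// Note (sketch, inferred from the inner calls below): roughly
+// D = beta*C + alpha * A * B^T on an 8x8 tile, with only the lower triangle
+// stored (inner_store_l_8x8_lib4); with B = A this is the 'syrk' rank-k
+// update of a symmetric matrix kept in its lower triangular part.
+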
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x8_lib4
+ .type kernel_dsyrk_nt_l_8x8_lib4, @function
+kernel_dsyrk_nt_l_8x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x8_lib4
+_kernel_dsyrk_nt_l_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x8_lib4
+ .def kernel_dsyrk_nt_l_8x8_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ movq ARG6, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x8_lib4, .-kernel_dsyrk_nt_l_8x8_lib4
+#endif
+
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_dsyrk_nt_l_8x8_vs_lib4(int k, double *alpha, double *A, int sda, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_8x8_vs_lib4
+ .type kernel_dsyrk_nt_l_8x8_vs_lib4, @function
+kernel_dsyrk_nt_l_8x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_8x8_vs_lib4
+_kernel_dsyrk_nt_l_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_8x8_vs_lib4
+ .def kernel_dsyrk_nt_l_8x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_8x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG5, %r13 // B
+ movq ARG6, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+ movq ARG9, %r13 // sdc
+ sall $5, %r13d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG10, %r10 // D
+ movq ARG11, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+	movq ARG12, %r12 // km
+	movq ARG13, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_8x8_vs_lib4, .-kernel_dsyrk_nt_l_8x8_vs_lib4
+#endif
+
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_dpotrf_nt_l_8x8_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
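+// Note (sketch, inferred from the inner calls below): roughly a blocked
+// Cholesky step on an 8x8 tile, D = lower Cholesky factor of (C - A * B^T),
+// implemented as dgemm_sub + scale_11 + the dpotrf edge routine; inv_diag_D
+// receives the reciprocals of the diagonal of D, as used by the trsm edge
+// routines above.
+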
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_8x8_lib4
+ .type kernel_dpotrf_nt_l_8x8_lib4, @function
+kernel_dpotrf_nt_l_8x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_8x8_lib4
+_kernel_dpotrf_nt_l_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_8x8_lib4
+ .def kernel_dpotrf_nt_l_8x8_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+ movl $8, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // store address D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_8x8_lib4, .-kernel_dpotrf_nt_l_8x8_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_dpotrf_nt_l_8x8_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_8x8_vs_lib4
+ .type kernel_dpotrf_nt_l_8x8_vs_lib4, @function
+kernel_dpotrf_nt_l_8x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_8x8_vs_lib4
+_kernel_dpotrf_nt_l_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_8x8_vs_lib4
+ .def kernel_dpotrf_nt_l_8x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_8x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13 // B
+ movq ARG5, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG10, %r10 // inv_diag_D
+ movq ARG12, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // store address D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_8x8_vs_lib4, .-kernel_dpotrf_nt_l_8x8_vs_lib4
+#endif
+
+
+
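+// Note: the fused dsyrk_dpotrf kernels below chain the two operations above in a single call:
+// they accumulate +Ap*Bp^T over kp columns and -Am*Bm^T over km columns, add C, and then run the
+// same 8x8 Cholesky edge and lower-triangular store, so the intermediate block stays in the ymm
+// accumulators instead of being written back to memory between the syrk update and the
+// factorization.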
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_dsyrk_dpotrf_nt_l_8x8_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x8_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_8x8_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_8x8_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x8_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_8x8_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+ movq ARG5, %r14 // sdbp
+ sall $5, %r14d // 4*sdbp*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+ movq ARG6, %r10 // km
+ movq ARG7, %r11 // Am
+ movq ARG8, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG9, %r13 // Bm
+ movq ARG10, %r14 // sdbm
+ sall $5, %r14d // 4*sdbm*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG11, %r10 // C
+ movq ARG12, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG15, %r10 // inv_diag_D
+ movl $8, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG13, %r10 // store address D
+ movq ARG14, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_8x8_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x8_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
+// void kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+ movq ARG5, %r14 // sdbp
+ sall $5, %r14d // 4*sdbp*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG6, %r10 // km
+ movq ARG7, %r11 // Am
+ movq ARG8, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG9, %r13 // Bm
+ movq ARG10, %r14 // sdbm
+ sall $5, %r14d // 4*sdbm*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG11, %r10 // C
+ movq ARG12, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG15, %r10 // inv_diag_D
+ movq ARG17, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_8x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_8x8_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG13, %r10 // store address D
+ movq ARG14, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG16, %r12 // km
+ movq ARG17, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4
+#endif
+
+
+
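+// Note (hedged summary): the dtrsm kernels below combine the same 8x8 "sub" gemm update
+// (C - A*B^T) with a triangular solve against the lower-triangular factor E; the precomputed
+// reciprocals in inv_diag_E replace the divisions of a textbook substitution, i.e. each solved
+// entry is formed as (rhs - dot) * inv_diag_E[j] rather than (rhs - dot) / E[j][j]. The 8x8l and
+// 8x8u variants appear to handle the two halves of the blocked 8x8 solve, calling the matching
+// inner_edge_dtrsm_rlt_inv_8x8l/u and inner_store_8x8l/u routines.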
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_dtrsm_nt_rl_inv_8x8l_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x8l_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x8l_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x8l_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x8l_lib4
+_kernel_dtrsm_nt_rl_inv_8x8l_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x8l_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x8l_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x8l_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+ movq ARG5, %r14
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG6, %r10
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X8L_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x8l_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x8l_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // store address D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8L_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8l_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8l_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x8l_lib4, .-kernel_dtrsm_nt_rl_inv_8x8l_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_dtrsm_nt_rl_inv_8x8u_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x8u_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x8u_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x8u_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x8u_lib4
+_kernel_dtrsm_nt_rl_inv_8x8u_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x8u_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x8u_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x8u_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG5, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG2, %r13 // A
+	movq	ARG3, %r14 // sda
+	sall	$5, %r14d // 4*sda*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG6, %r10
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+ movq ARG8, %r13 // D
+ movq ARG9, %r14 // sdd
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X8U_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x8u_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x8u_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // store address D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8U_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8u_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8u_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x8u_lib4, .-kernel_dtrsm_nt_rl_inv_8x8u_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
+_kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+ movq ARG5, %r14
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG6, %r10
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // store address D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG13, %r12 // km
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8l_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8l_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4(int k, double *A, int sda, double *B, int sdb, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
+_kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG5, %r12 // sdb
+	sall	$5, %r12d // 4*sdb*sizeof(double)
+	movq	ARG2, %r13 // A
+	movq	ARG3, %r14 // sda
+	sall	$5, %r14d // 4*sda*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG6, %r10 // C
+ movq ARG7, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG10, %r10 // E
+ movq ARG11, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG12, %r12 // inv_diag_E
+ movq ARG8, %r13 // D
+ movq ARG9, %r14 // sdd
+	sall	$5, %r14d // 4*sdd*sizeof(double)
+ movq ARG14, %r15 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // store address D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG13, %r12 // km
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8u_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8u_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4, .-kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG4, %r13
+ movq ARG5, %r14
+ sall $5, %r14d // 4*sdb*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG6, %r10 // km
+	movq	ARG7, %r11 // Am
+	movq	ARG8, %r12 // sdam
+	sall	$5, %r12d // 4*sdam*sizeof(double)
+	movq	ARG9, %r13 // Bm
+	movq	ARG10, %r14 // sdbm
+	sall	$5, %r14d // 4*sdbm*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG11, %r10
+ movq ARG12, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG15, %r10 // E
+ movq ARG16, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG17, %r12 // inv_diag_E
+ movq ARG19, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x8l_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG13, %r10 // store address D
+ movq ARG14, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG18, %r12 // km
+ movq ARG19, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8L_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8l_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8l_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+// void kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdbp, int km, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovapd %ymm0, %ymm8
+ vmovapd %ymm0, %ymm9
+ vmovapd %ymm0, %ymm10
+ vmovapd %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // kp
+	movq	ARG4, %r11 // Bp
+	movq	ARG5, %r12 // sdbp
+	sall	$5, %r12d // 4*sdbp*sizeof(double)
+	movq	ARG2, %r13 // Ap
+	movq	ARG3, %r14 // sdap
+	sall	$5, %r14d // 4*sdap*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_8x8_lib4
+#endif
+#endif
+
+
+	// call inner dgemm kernel nt sub
+
+	movq	ARG6, %r10 // km
+	movq	ARG9, %r11 // Bm
+	movq	ARG10, %r12 // sdbm
+	sall	$5, %r12d // 4*sdbm*sizeof(double)
+	movq	ARG7, %r13 // Am
+	movq	ARG8, %r14 // sdam
+	sall	$5, %r14d // 4*sdam*sizeof(double)
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_8x8_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG11, %r10 // C
+ movq ARG12, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_8X8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_8x8_lib4
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_8x8_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG15, %r10 // E
+ movq ARG16, %r11 // sde
+ sall $5, %r11d // 4*sde*sizeof(double)
+ movq ARG17, %r12 // inv_diag_E
+ movq ARG13, %r13 // D
+ movq ARG14, %r14 // sdd
+	sall	$5, %r14d // 4*sdd*sizeof(double)
+ movq ARG19, %r15 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_8x8u_vs_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG13, %r10 // store address D
+ movq ARG14, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG18, %r12 // km
+ movq ARG19, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8U_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8u_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8x8u_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
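+// Note: .LC00 and .LC01 above are 64-bit integer mask constants; the constants from .LC02 to
+// .LC10 below encode four doubles each as pairs of 32-bit words, low word first, e.g.
+// { .long 0, .long 1071644672 } is 0x3FE0000000000000 = 0.5 and { .long 0, .long 1074528256 }
+// is 0x400C000000000000 = 3.5 (the { ... } comments list the lanes high-to-low). The word pairs
+// can be reproduced with a small stand-alone C helper such as this hypothetical one:
+//
+//	#include <stdio.h>
+//	#include <stdint.h>
+//	#include <string.h>
+//	int main(void)
+//		{
+//		double v[4] = {0.5, 1.5, 2.5, 3.5}; // memory order of .LC02
+//		for(int i=0; i<4; i++)
+//			{
+//			uint64_t u;
+//			memcpy(&u, &v[i], sizeof(u));   // raw bit pattern of the double
+//			printf("\t.long\t%d\n\t.long\t%d\n", (int32_t)(u & 0xffffffffu), (int32_t)(u >> 32));
+//			}
+//		return 0;
+//		}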
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC05: // { 1.0 1.0 1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC05: // { 1.0 1.0 1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC06: // { 1.0 1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC06: // { 1.0 1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC07: // { 1.0 -1.0 -1.0 -1.0 }
+#endif
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long 1072693248
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC08: // { -1.0 -1.0 -1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC08: // { -1.0 -1.0 -1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+ .long 0
+ .long -1074790400
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC10: // { -1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC10: // { -1.0 1.0 1.0 1.0 }
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long -1074790400
+
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgemv_8_lib4.S b/kernel/avx2/kernel_dgemv_8_lib4.S
new file mode 100644
index 0000000..1c9185a
--- /dev/null
+++ b/kernel/avx2/kernel_dgemv_8_lib4.S
@@ -0,0 +1,1543 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
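+// Note: the PROLOGUE/EPILOGUE pairs above save and restore the callee-saved registers of each
+// ABI: rbx, rbp and r12-r15 for the System V (Linux/Mac) calling convention, plus rdi, rsi and
+// xmm6-xmm15 on Windows x64, which is why the Windows frame uses STACKSIZE 256 instead of 64;
+// vzeroupper is presumably there to avoid AVX/SSE transition penalties around the call boundary.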
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x+k*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_n_8_lib4, @function
+inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_n_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_n_8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ subl $4, %r10d
+
+ vbroadcastsd 8(%r13), %ymm12
+ vmovapd 32(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vmovapd 32(%r15), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ vbroadcastsd 16(%r13), %ymm12
+ vmovapd 64(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm4
+ vmovapd 64(%r15), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm5
+
+ vbroadcastsd 24(%r13), %ymm12
+ addq $32, %r13
+ vmovapd 96(%r11), %ymm8
+ addq $128, %r11
+ vfmadd231pd %ymm8, %ymm12, %ymm6
+ vmovapd 96(%r15), %ymm8
+ addq $128, %r15
+ vfmadd231pd %ymm8, %ymm12, %ymm7
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vmulpd %ymm8, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ addq $32, %r11
+ addq $32, %r15
+ addq $8, %r13
+
+ subl $1, %r10d
+ cmpl $0, %r10d
+
+ jg 0b // clean
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_n_8_lib4, .-inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x+k*sizeof(double)
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_t_8_lib4, @function
+inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_t_8_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_t_8_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ vmovapd 128(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm4
+
+ vmovapd 160(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm5
+
+ vmovapd 192(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm6
+
+ vmovapd 224(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm7
+
+ addq %r12, %r11
+ addq $32, %r13
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm14
+
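+	// %ymm14 now holds { 0.5-k, 1.5-k, 2.5-k, 3.5-k } for the remaining count k in %r10d,
+	// so exactly the first k lanes are negative; vmaskmovpd uses the sign bit of each lane
+	// as the mask, loading only the k valid entries of x and zeroing the rest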
+ vmaskmovpd 0(%r13), %ymm14, %ymm12
+
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ vmovapd 32(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ vmovapd 64(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 96(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ vmovapd 128(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm4
+
+ vmovapd 160(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm5
+
+ vmovapd 192(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm6
+
+ vmovapd 224(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm7
+
+ sall $3, %r10d
+// movslq %r10d, %r10
+ addq %r10, %r11
+ addq %r10, %r13
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_t_8_lib4, .-inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- k-4
+// r11 <- A+4*4*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- x+4*sizeof(double)
+// r15 <- dirty
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmv_un_8_lib4, @function
+inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmv_un_8_lib4:
+#endif
+#endif
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(double)
+
+ vxorpd %ymm14, %ymm14, %ymm14
+
+ // first 4 columns
+ vmovapd 0(%r11), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 0(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 8(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+
+ vmovapd 64(%r11), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vbroadcastsd 16(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm4
+
+ vmovapd 96(%r11), %ymm8
+ vbroadcastsd 24(%r13), %ymm12
+ vfmadd231pd %ymm8, %ymm12, %ymm6
+
+ addq $128, %r11
+ addq $128, %r15
+ addq $32, %r13
+
+
+
+ // last 4 columns
+ vbroadcastsd 0(%r13), %ymm12
+ vmovapd 0(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm0
+ vmovapd 0(%r15), %ymm8
+ vblendpd $0x1, %ymm8, %ymm14, %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm1
+
+ subl $4, %r10d
+
+ vbroadcastsd 8(%r13), %ymm12
+ vmovapd 32(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm2
+ vmovapd 32(%r15), %ymm8
+ vblendpd $0x3, %ymm8, %ymm14, %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm3
+
+ vbroadcastsd 16(%r13), %ymm12
+ vmovapd 64(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm4
+ vmovapd 64(%r15), %ymm8
+ vblendpd $0x7, %ymm8, %ymm14, %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm5
+
+ vbroadcastsd 24(%r13), %ymm12
+ vmovapd 96(%r11), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm6
+ vmovapd 96(%r15), %ymm8
+ vfmadd231pd %ymm8, %ymm12, %ymm7
+
+ addq $128, %r11
+ addq $128, %r15
+ addq $32, %r13
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmv_un_8_lib4, .-inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n
+//
+// input arguments:
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_8_lib4, @function
+inner_blend_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_8_lib4, .-inner_blend_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t
+//
+// input arguments:
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_8_lib4, @function
+inner_blend_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_8_lib4, .-inner_blend_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==n, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_n_scale_ab_8_lib4, @function
+inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_n_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_n_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_n_scale_ab_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+ vaddpd %ymm4, %ymm6, %ymm4
+ vaddpd %ymm5, %ymm7, %ymm5
+ vaddpd %ymm0, %ymm4, %ymm0
+ vaddpd %ymm1, %ymm5, %ymm1
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vfmadd231pd %ymm15, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vfmadd231pd %ymm15, %ymm14, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_n_scale_ab_8_lib4, .-inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_8_lib4, @function
+inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_8_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm1, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vfmadd231pd %ymm15, %ymm14, %ymm0
+ vmovupd 32(%r12), %ymm14
+ vfmadd231pd %ymm15, %ymm14, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_8_lib4, .-inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==n
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]_a
+// ymm1 <- [z4 z5 z6 z7]_a
+// ymm2 <- [z0 z1 z2 z3]_b
+// ymm3 <- [z4 z5 z6 z7]_b
+// ymm4 <- [z0 z1 z2 z3]_c
+// ymm5 <- [z4 z5 z6 z7]_c
+// ymm6 <- [z0 z1 z2 z3]_d
+// ymm7 <- [z4 z5 z6 z7]_d
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_N_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_n_8_lib4, @function
+inner_blender_n_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_n_8_lib4; .scl 2; .type 32; .endef
+inner_blender_n_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vaddpd %ymm0, %ymm2, %ymm0
+ vaddpd %ymm1, %ymm3, %ymm1
+ vaddpd %ymm4, %ymm6, %ymm4
+ vaddpd %ymm5, %ymm7, %ymm5
+ vaddpd %ymm0, %ymm4, %ymm0
+ vaddpd %ymm1, %ymm5, %ymm1
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_n_8_lib4, .-inner_blender_n_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for ta==t
+//
+// input arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm6 <- [z6a z6b z6c z6d]
+// ymm7 <- [z7a z7b z7c z7d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10d <- alg
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLENDER_T_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blender_t_8_lib4, @function
+inner_blender_t_8_lib4:
+#elif defined(OS_MAC)
+_inner_blender_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blender_t_8_lib4; .scl 2; .type 32; .endef
+inner_blender_t_8_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm5, %ymm4, %ymm4
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm7, %ymm6, %ymm6
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm3
+ vperm2f128 $0x2, %ymm4, %ymm6, %ymm5
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vperm2f128 $0x13, %ymm4, %ymm6, %ymm4
+ vaddpd %ymm0, %ymm3, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm1
+
+ cmpl $0, %r10d // alg
+ je 0f // return
+
+ cmpl $1, %r10d // alg
+ jne 1f // alg==-1
+
+ // alg==1
+ vmovupd 0(%r11), %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+
+ jmp 0f // return
+
+1:
+
+ // alg==-1
+ vmovupd 0(%r11), %ymm15
+ vsubpd %ymm0, %ymm15, %ymm0
+ vmovupd 32(%r11), %ymm15
+ vsubpd %ymm1, %ymm15, %ymm1
+
+0: // return
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blender_t_8_lib4, .-inner_blender_t_8_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 z6 z7]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8_lib4, @function
+inner_store_8_lib4:
+#elif defined(OS_MAC)
+_inner_store_8_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8_lib4; .scl 2; .type 32; .endef
+inner_store_8_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %ymm1, 32(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8_lib4, .-inner_store_8_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_n_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_n_8_lib4
+ .type kernel_dgemv_n_8_lib4, @function
+kernel_dgemv_n_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_n_8_lib4
+_kernel_dgemv_n_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_n_8_lib4
+ .def kernel_dgemv_n_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_n_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_N_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_scale_ab_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_scale_ab_8_lib4
+#endif
+#endif
+
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_n_8_lib4, .-kernel_dgemv_n_8_lib4
+#endif
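Going by the signature comment above, this routine computes z[0:8] = beta*y[0:8] + alpha*A*x for an 8 x k block A stored in the packed "lib4" layout: two stacked 4-row panels with panel stride sda (the kernel itself turns sda into the byte stride 4*sda*sizeof(double)). A minimal calling sketch in C (a hypothetical driver, not part of the library; it assumes 32-byte aligned storage, k a multiple of 4, and the usual panel-major indexing where element (i,j) sits at A[(i/4)*4*sda + j*4 + i%4]):

#include <stdio.h>
#include <stdlib.h>

// prototype as given in the comment above
void kernel_dgemv_n_8_lib4(int k, double *alpha, double *A, int sda,
	double *x, double *beta, double *y, double *z);

int main(void)
	{
	int k = 8, sda = 8;
	double alpha = 1.0, beta = 0.0;
	double *A = aligned_alloc(32, 2*4*sda*sizeof(double)); // two 4 x sda panels
	double *x = aligned_alloc(32, k*sizeof(double));
	double y[8] = {0}, z[8];
	for(int j=0; j<k; j++)
		{
		x[j] = 1.0;
		for(int i=0; i<8; i++)
			A[(i/4)*4*sda + j*4 + i%4] = (double)(i+j); // element (i,j)
		}
	kernel_dgemv_n_8_lib4(k, &alpha, A, sda, x, &beta, y, z);
	for(int i=0; i<8; i++)
		printf("z[%d] = %f\n", i, z[i]);
	free(A);
	free(x);
	return 0;
	}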
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dgemv_t_8_lib4(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_t_8_lib4
+ .type kernel_dgemv_t_8_lib4, @function
+kernel_dgemv_t_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_t_8_lib4
+_kernel_dgemv_t_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_t_8_lib4
+ .def kernel_dgemv_t_8_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_t_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+	// call inner dgemv kernel t
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG5, %r13 // x
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_T_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_t_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_t_8_lib4
+#endif
+#endif
+
+
+	// call inner blend t scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // y
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_8_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_t_8_lib4, .-kernel_dgemv_t_8_lib4
+#endif
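For reference, the transposed variant forms eight dot products, one per column of a k x 8 panel-major block, and then applies the alpha/beta scaling in the blend routine. A plain-C restatement of the computed result (a sketch under the same hypothetical panel indexing as above, not the library's own reference code):

// z[0:8] = beta*y[0:8] + alpha * A^T * x, A being a k x 8 block stored in
// 4-row panels with stride sda: element (i,j) at A[(i/4)*4*sda + j*4 + i%4].
static void ref_dgemv_t_8_lib4(int k, double alpha, const double *A, int sda,
	const double *x, double beta, const double *y, double *z)
	{
	for(int j=0; j<8; j++)
		{
		double acc = 0.0;
		for(int i=0; i<k; i++)
			acc += A[(i/4)*4*sda + j*4 + i%4] * x[i];
		z[j] = beta*y[j] + alpha*acc;
		}
	}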
+
+
+
+
+
+// rdi rsi rdx rcx r8
+// void kernel_dtrmv_un_8_lib4(int k, double *A, int sda, double *x, double *z);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmv_un_8_lib4
+ .type kernel_dtrmv_un_8_lib4, @function
+kernel_dtrmv_un_8_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmv_un_8_lib4
+_kernel_dtrmv_un_8_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmv_un_8_lib4
+ .def kernel_dtrmv_un_8_lib4; .scl 2; .type 32; .endef
+kernel_dtrmv_un_8_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dtrmv edge & dgemv kernel n
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG4, %r13 // x
+
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMV_UN_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmv_un_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmv_un_8_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_n_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_n_8_lib4
+#endif
+#endif
+
+
+ // call inner blender n
+
+#if MACRO_LEVEL>=1
+ INNER_BLENDER_N_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_n_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_n_8_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // z
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_8_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmv_un_8_lib4, .-kernel_dtrmv_un_8_lib4
+#endif
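The triangular kernel reuses the dgemv_n machinery: the edge routine handles the leading 8 x 8 upper-triangular block of the column slice, after which the ordinary dgemv_n inner kernel sweeps the remaining columns, and the plain blend plus store finish the job. Roughly, in scalar terms (a sketch under the same hypothetical panel indexing; only the upper part of the leading 8 x 8 block is meant to be referenced):

// z[0:8] = A[0:8,0:k] * x[0:k], where columns 0..7 form an upper-triangular
// 8 x 8 block (entries below the diagonal treated as zero).
static void ref_dtrmv_un_8_lib4(int k, const double *A, int sda,
	const double *x, double *z)
	{
	for(int i=0; i<8; i++)
		{
		double acc = 0.0;
		for(int j=i; j<8; j++) // triangular edge
			acc += A[(i/4)*4*sda + j*4 + i%4] * x[j];
		for(int j=8; j<k; j++) // rectangular remainder
			acc += A[(i/4)*4*sda + j*4 + i%4] * x[j];
		z[i] = acc;
		}
	}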
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
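In the constant pools above, .LC00 and .LC01 are raw 64-bit integer patterns used as blend/sign masks, while .LC02, .LC03 and .LC04 encode doubles as pairs of 32-bit words with the low word first: 0/1072693248 is 0x3FF0000000000000, i.e. 1.0, and .LC02/.LC03 hold the lane offsets 0.5 through 7.5 used by the masked clean-up code. A quick stand-alone check of those encodings (illustrative C, not part of the build):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
	{
	double d[] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 1.0};
	for(int i=0; i<9; i++)
		{
		uint64_t u;
		memcpy(&u, &d[i], 8);
		// print the low 32-bit word first, matching the .long pairs above
		printf("%4.1f -> .long %u, %u\n", d[i], (unsigned)u, (unsigned)(u>>32));
		}
	return 0;
	}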
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_dgetrf_pivot_4_lib4.c b/kernel/avx2/kernel_dgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..b1329fe
--- /dev/null
+++ b/kernel/avx2/kernel_dgetrf_pivot_4_lib4.c
@@ -0,0 +1,1435 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <mmintrin.h>
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#include <smmintrin.h> // SSE4
+#include <immintrin.h> // AVX
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
+// C numbering (starting from zero) in the ipiv
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv)
+ {
+
+ const int bs = 4;
+
+ // assume m>=4
+ int ma = m-4;
+
+ __m128d
+ max0, max1, msk0, imx0, imx1,
+ inv;
+
+
+ __m256d
+ lft, msk,
+ sgn, vna, max, imx, idx,
+ ones,
+ tmp,
+ a_0,
+ b_0, b_1, b_2,
+ scl,
+ c_0,
+ d_0;
+
+ double
+ dlft;
+
+ sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+ vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+ lft = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+
+ double
+ tmp0;
+
+ double
+ *pB;
+
+ int
+ k, idamax;
+
+ int B_pref = bs*sda;
+
+
+ // first column
+
+ // find pivot
+ pB = &pA[0+bs*0];
+ idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ k = 0;
+ for( ; k<m-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for( ; k<m-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<m)
+ {
+ dlft = m-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ a_0 = _mm256_load_pd( &pB[0] );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ inv = _mm_loaddup_pd( &pA[0+bs*0] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[0], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[0] = 0.0;
+ }
+
+
+ // second column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ c_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x1 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ _mm256_store_pd( &pA[0+bs*1], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[1] = idamax+1;
+ if(tmp0!=0)
+ {
+ if(ipiv[1]!=1)
+ drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ inv = _mm_loaddup_pd( &pA[1+bs*1] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[1], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[1] = 0.0;
+ }
+
+
+ // third column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x3 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ _mm256_store_pd( &pA[0+bs*2], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[2] = idamax+2;
+ if(tmp0!=0)
+ {
+ if(ipiv[2]!=2)
+ drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ inv = _mm_loaddup_pd( &pA[2+bs*2] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[2], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[2] = 0.0;
+ }
+
+
+ // fourth column
+
+ // scale & correct & find pivot
+ idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x1 );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x3 );
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+ a_0 = _mm256_blend_pd( tmp, a_0, 0x7 );
+ b_2 = _mm256_permute_pd( b_2, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blend_pd( tmp, c_0, 0x7 );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ _mm256_store_pd( &pA[0+bs*3], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[3] = idamax+3;
+ if(tmp0!=0)
+ {
+ if(ipiv[3]!=3)
+ drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ inv = _mm_loaddup_pd( &pA[3+bs*3] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[3], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[3] = 0.0;
+ }
+
+ // scale
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ tmp = _mm256_mul_pd( c_0, scl );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+// pB += B_pref;
+ }
+
+ return;
+
+ }
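The pivot search in this kernel is a branch-free, vectorized idamax: each 4-lane load is compared against the running maxima with _mm256_cmp_pd(..., 14) (predicate 14 is ordered greater-than), and a pair of _mm256_blendv_pd updates both the maxima and the candidate row indices, which are carried as doubles in idx and recovered at the end with _mm_cvtsd_si32 after the cross-lane reduction. A scalar sketch of the selection logic it implements (illustrative only; hypothetical helper name):

#include <math.h>

// index of an entry of maximum absolute value in x[0..m-1];
// the strict > matches the compare predicate used above.
static int ref_idamax(int m, const double *x)
	{
	int imax = 0;
	double amax = 0.0;
	for(int i=0; i<m; i++)
		{
		double a = fabs(x[i]);
		if(a > amax)
			{
			amax = a;
			imax = i;
			}
		}
	return imax;
	}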
+
+
+
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ // assume m>=4
+ int ma = m-4;
+
+ __m128d
+ max0, max1, msk0, imx0, imx1,
+ inv;
+
+
+ __m256d
+ lft, msk,
+ sgn, vna, max, imx, idx,
+ ones,
+ tmp,
+ a_0,
+ b_0, b_1, b_2,
+ scl,
+ c_0,
+ d_0;
+
+ double
+ dlft;
+
+ sgn = _mm256_set_pd( -0.0, -0.0, -0.0, -0.0 );
+ vna = _mm256_set_pd( 4.0, 4.0, 4.0, 4.0 );
+ lft = _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ ones = _mm256_set_pd( 1.0, 1.0, 1.0, 1.0 );
+
+ double
+ tmp0;
+
+ double
+ *pB;
+
+ int
+ k, idamax;
+
+ int B_pref = bs*sda;
+
+
+ // first column
+
+ // find pivot
+ pB = &pA[0+bs*0];
+ idx = lft; // _mm256_set_pd( 3.2, 2.2, 1.2, 0.2 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ k = 0;
+ for( ; k<m-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0] );
+// __builtin_prefetch( pB+2*B_pref );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for( ; k<m-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0] );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<m)
+ {
+ dlft = m-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ a_0 = _mm256_load_pd( &pB[0] );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_blendv_pd( a_0, sgn, msk );
+ a_0 = _mm256_andnot_pd( sgn, a_0 ); // abs
+ msk = _mm256_cmp_pd( a_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, a_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ inv = _mm_loaddup_pd( &pA[0+bs*0] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[0], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[0] = 0.0;
+ }
+
+ if(n==1)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+
+ // second column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 2.2, 1.2, 0.2, -0.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ c_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x1 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ d_0 = _mm256_blend_pd( d_0, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*0], a_0 );
+ _mm256_store_pd( &pA[0+bs*1], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x1 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*0], a_0 );
+ _mm256_store_pd( &pB[0+bs*1], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>1)
+ {
+ ipiv[1] = idamax+1;
+ if(tmp0!=0)
+ {
+ if(ipiv[1]!=1)
+ drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ inv = _mm_loaddup_pd( &pA[1+bs*1] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[1], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[1] = 0.0;
+ }
+ }
+
+ if(n==2)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+ // third column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 1.2, 0.2, -0.8, -1.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*2] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x3 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*1], a_0 );
+ _mm256_store_pd( &pA[0+bs*2], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x3 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*1], a_0 );
+ _mm256_store_pd( &pB[0+bs*2], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>2)
+ {
+ ipiv[2] = idamax+2;
+ if(tmp0!=0)
+ {
+ if(ipiv[2]!=2)
+ drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ inv = _mm_loaddup_pd( &pA[2+bs*2] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[2], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[2] = 0.0;
+ }
+ }
+
+ if(n==3)
+ {
+ // scale & return
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ // pB += B_pref;
+ }
+
+ return;
+ }
+
+ // fourth column
+
+ // scale & correct & find pivot
+ dlft = m;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ idx = _mm256_set_pd( 0.2, -0.8, -1.8, -2.8 );
+ max = _mm256_setzero_pd();
+ imx = _mm256_setzero_pd();
+ c_0 = _mm256_load_pd( &pA[0+bs*3] );
+ b_0 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_0 = _mm256_permute_pd( b_0, 0x0 );
+ a_0 = _mm256_load_pd( &pA[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x1 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ b_1 = _mm256_permute2f128_pd( c_0, c_0, 0x00 );
+ b_1 = _mm256_permute_pd( b_1, 0xf );
+ a_0 = _mm256_load_pd( &pA[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x3 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ a_0 = _mm256_load_pd( &pA[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ b_2 = _mm256_permute2f128_pd( c_0, c_0, 0x11 );
+ tmp = _mm256_blend_pd( tmp, a_0, 0x7 );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ b_2 = _mm256_permute_pd( b_2, 0x0 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ tmp = _mm256_sub_pd( c_0, tmp );
+ tmp = _mm256_blend_pd( tmp, c_0, 0x7 );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pA[0+bs*2], a_0 );
+ _mm256_store_pd( &pA[0+bs*3], c_0 );
+ c_0 = _mm256_blend_pd( c_0, sgn, 0x7 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+// __builtin_prefetch( pB+2*B_pref );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+// __builtin_prefetch( pB+2*B_pref+8 );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ a_0 = _mm256_mul_pd( a_0, scl );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ c_0 = _mm256_sub_pd( c_0, tmp );
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+ idx = _mm256_add_pd( idx, vna );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ a_0 = _mm256_load_pd( &pB[0+bs*0] );
+ tmp = _mm256_mul_pd( a_0, b_0 );
+ d_0 = _mm256_sub_pd( c_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*1] );
+ tmp = _mm256_mul_pd( a_0, b_1 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ a_0 = _mm256_load_pd( &pB[0+bs*2] );
+ tmp = _mm256_mul_pd( a_0, scl );
+ a_0 = _mm256_blendv_pd( tmp, a_0, msk );
+ tmp = _mm256_mul_pd( a_0, b_2 );
+ d_0 = _mm256_sub_pd( d_0, tmp );
+ c_0 = _mm256_blendv_pd( d_0, c_0, msk);
+ _mm256_store_pd( &pB[0+bs*2], a_0 );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ c_0 = _mm256_blendv_pd( c_0, sgn, msk );
+ c_0 = _mm256_andnot_pd( sgn, c_0 ); // abs
+ msk = _mm256_cmp_pd( c_0, max, 14 ); // >
+ max = _mm256_blendv_pd( max, c_0, msk );
+ imx = _mm256_blendv_pd( imx, idx, msk );
+// idx = _mm256_add_pd( idx, vna );
+// pB += B_pref;
+ }
+ max0 = _mm256_extractf128_pd( max, 0x0 );
+ max1 = _mm256_extractf128_pd( max, 0x1 );
+ imx0 = _mm256_extractf128_pd( imx, 0x0 ); // lower indexes in case of identical max value
+ imx1 = _mm256_extractf128_pd( imx, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ max1 = _mm_permute_pd( max0, 0x1 );
+ imx1 = _mm_permute_pd( imx0, 0x1 );
+ msk0 = _mm_cmp_pd( max1, max0, 14 );
+ max0 = _mm_blendv_pd( max0, max1, msk0 );
+ imx0 = _mm_blendv_pd( imx0, imx1, msk0 );
+ _mm_store_sd( &tmp0, max0 );
+ idamax = _mm_cvtsd_si32( imx0 );
+
+ // compute scaling
+ if(m>3)
+ {
+ ipiv[3] = idamax+3;
+ if(tmp0!=0)
+ {
+ if(ipiv[3]!=3)
+ drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ inv = _mm_loaddup_pd( &pA[3+bs*3] );
+ inv = _mm_div_pd( _mm256_castpd256_pd128( ones ), inv );
+ scl = _mm256_permute2f128_pd( _mm256_castpd128_pd256( inv ), _mm256_castpd128_pd256( inv ), 0x00 );
+ _mm_store_sd( &inv_diag_A[3], inv );
+ }
+ else
+ {
+ scl = ones;
+ inv_diag_A[3] = 0.0;
+ }
+ }
+
+ // scale
+ pB = pA + B_pref;
+ k = 0;
+ for(; k<ma-7; k+=8)
+ {
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+// __builtin_prefetch( pB+2*B_pref+8 );
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ for(; k<ma-3; k+=4)
+ {
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ c_0 = _mm256_mul_pd( c_0, scl );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+ pB += B_pref;
+ }
+ if(k<ma)
+ {
+ dlft = ma-k;
+ msk = _mm256_broadcast_sd( &dlft );
+ msk = _mm256_cmp_pd( lft, msk, 14 ); // >
+ c_0 = _mm256_load_pd( &pB[0+bs*3] );
+ tmp = _mm256_mul_pd( c_0, scl );
+ c_0 = _mm256_blendv_pd( tmp, c_0, msk );
+ _mm256_store_pd( &pB[0+bs*3], c_0 );
+// pB += B_pref;
+ }
+
+ return;
+
+ }
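The _vs (variable-size) variant differs from the fixed-size kernel above mainly in how it guards partial panels: the remaining row count is broadcast and compared against the lane offsets in lft = {3.2, 2.2, 1.2, 0.2}, producing an all-ones mask exactly in the lanes that lie past the end of the column, and _mm256_blendv_pd then leaves those lanes untouched (or forces them to the sign constant so they cannot win the pivot search). A small sketch of that mask construction (it mirrors the msk computation above; the helper name is hypothetical):

#include <immintrin.h>

// all-ones in lane i exactly when i >= m_left (lane past the end of the data),
// zero in lanes 0 .. m_left-1
static inline __m256d tail_mask(int m_left)
	{
	const __m256d lft = _mm256_set_pd(3.2, 2.2, 1.2, 0.2);
	double dleft = (double)m_left;
	__m256d rem = _mm256_broadcast_sd(&dleft);
	return _mm256_cmp_pd(lft, rem, 14); // 14 == greater-than (ordered)
	}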
+
+
diff --git a/kernel/avx2/kernel_dsymv_6_lib4.S b/kernel/avx2/kernel_dsymv_6_lib4.S
new file mode 100644
index 0000000..7a4411c
--- /dev/null
+++ b/kernel/avx2/kernel_dsymv_6_lib4.S
@@ -0,0 +1,996 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
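The two prologue/epilogue variants encode the calling-convention differences: on the System V ABI only rbx, rbp and r12-r15 need saving and the 7th and later arguments live just above the return address, while the Windows x64 ABI additionally treats rdi, rsi and xmm6-xmm15 as callee-saved and reserves a 32-byte shadow space, which is why its stack arguments start at STACKSIZE+40. The offset arithmetic for the Linux/Mac case, restated as plain C constants (the comments only repeat what the macros above already encode):

// at entry:             0(%rsp) = return address, 8(%rsp) = 7th argument, ...
// after subq $64, %rsp: 64(%rsp) = return address, 72(%rsp) = 7th argument, ...
enum
	{
	stacksize_sysv = 64,
	arg7_off = stacksize_sysv + 8,  // ARG7
	arg8_off = stacksize_sysv + 16  // ARG8
	};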
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4 <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5 <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm4 <- [z_t_4a z_t_4b z_t_4c z_t_4d]
+// ymm5 <- [z_t_5a z_t_5b z_t_5c z_t_5d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- x_n_4
+// ymm11 <- x_n_5
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemv_add_nt_6_lib4, @function
+inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemv_add_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemv_add_nt_6_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemv_add_nt_6_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $4, %r10d
+ jl 0f // clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovapd 0(%r11), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm6, %ymm13
+
+ subl $4, %r10d
+
+ vmovapd 32(%r11), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm7, %ymm13
+
+ vmovapd 64(%r11), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm8, %ymm13
+
+ vmovapd 96(%r11), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm9, %ymm13
+
+ vmovapd 128(%r11), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm14, %ymm10, %ymm13
+
+ vmovapd 160(%r11), %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm14, %ymm11, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ cmpl $3, %r10d
+
+ jg 1b // main loop
+
+
+ // consider clean-up
+ cmpl $0, %r10d
+ jle 2f // return
+
+0: // clean-up
+
+ vcvtsi2sd %r10d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovupd .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovupd LC02(%rip), %ymm13
+#endif
+ vmovddup %xmm14, %xmm14
+ vinsertf128 $1, %xmm14, %ymm14, %ymm14
+ vsubpd %ymm14, %ymm13, %ymm15
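+	// ymm15 = {0.5, 1.5, 2.5, 3.5} - (double)k_left: the first k_left lanes turn
+	// negative (sign bit set), so the vmaskmovpd below touches only those lanes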
+
+ vmaskmovpd 0(%r13), %ymm15, %ymm12
+ vmaskmovpd 0(%r14), %ymm15, %ymm13
+
+ vmaskmovpd 0(%r11), %ymm15, %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm0
+ vfmadd231pd %ymm14, %ymm6, %ymm13
+
+ vmaskmovpd 32(%r11), %ymm15, %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm1
+ vfmadd231pd %ymm14, %ymm7, %ymm13
+
+ vmaskmovpd 64(%r11), %ymm15, %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm2
+ vfmadd231pd %ymm14, %ymm8, %ymm13
+
+ vmaskmovpd 96(%r11), %ymm15, %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm3
+ vfmadd231pd %ymm14, %ymm9, %ymm13
+
+ vmaskmovpd 128(%r11), %ymm15, %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm4
+ vfmadd231pd %ymm14, %ymm10, %ymm13
+
+ vmaskmovpd 160(%r11), %ymm15, %ymm14
+ vfmadd231pd %ymm14, %ymm12, %ymm5
+ vfmadd231pd %ymm14, %ymm11, %ymm13
+
+ vmaskmovpd %ymm13, %ymm15, 0(%r14)
+
+ sall $3, %r10d
+ addq %r10, %r11
+ addq %r10, %r13
+ addq %r10, %r14
+ xorl %r10d, %r10d
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemv_add_nt_6_lib4, .-inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
+
+
+
+
+
+#if 0
+
+// TODO
+// common inner routine with file scope
+//
+// input arguments:
+// r10 <- kmax
+// r11 <- A
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t
+// r14 <- z_n
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- kmax-4
+// r11 <- A+4*k*sizeof(double)
+// r12 <- bs*sda*sizeof(double) = 32*sda
+// r13 <- x_t+k*sizeof(double)
+// r14 <- z_n+k*sizeof(double)
+// ymm0 <- [z_t_0a z_t_0b z_t_0c z_t_0d]
+// ymm1 <- [z_t_1a z_t_1b z_t_1c z_t_1d]
+// ymm2 <- [z_t_2a z_t_2b z_t_2c z_t_2d]
+// ymm3 <- [z_t_3a z_t_3b z_t_3c z_t_3d]
+// ymm6 <- x_n_0
+// ymm7 <- x_n_1
+// ymm8 <- x_n_2
+// ymm9 <- x_n_3
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dsymv_add_nt_4_lib4, @function
+inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dsymv_add_nt_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dsymv_add_nt_4_lib4; .scl 2; .type 32; .endef
+inner_edge_dsymv_add_nt_4_lib4:
+#endif
+#endif
+
+ vmovupd 0(%r13), %ymm12
+ vmovupd 0(%r14), %ymm13
+
+ vmovapd 0(%r11), %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm0, %ymm15, %ymm0
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm6, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 32(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x1, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm1, %ymm15, %ymm1
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm7, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 64(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm2, %ymm15, %ymm2
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm8, %ymm15
+ vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovapd 96(%r11), %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x7, %ymm15, %ymm14, %ymm14
+ vmulpd %ymm14, %ymm12, %ymm15
+ vaddpd %ymm3, %ymm15, %ymm3
+// vxorpd %ymm15, %ymm15, %ymm15
+// vblendpd $0x0, %ymm14, %ymm15, %ymm14
+// vmulpd %ymm14, %ymm9, %ymm15
+// vaddpd %ymm13, %ymm15, %ymm13
+
+ vmovupd %ymm13, 0(%r14)
+
+ addq %r12, %r11
+ addq $32, %r13
+ addq $32, %r14
+
+ subq $4, %r10
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dsymv_add_nt_4_lib4, .-inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm4 <- [z4a z4b z4c z4d]
+// ymm5 <- [z5a z5b z5c z5d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 0 0]

+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
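+// Plain-C sketch of this blend/scale step (inferred; acc[jj][0..3] denote the
+// 4-wide partial sums held in ymm0-5 on entry):
+//
+//   for(jj=0; jj<6; jj++)
+//       z[jj] = alpha[0]*(acc[jj][0]+acc[jj][1]+acc[jj][2]+acc[jj][3]) + beta[0]*y[jj];
+//
+// packed on exit as ymm0 = [z0 z1 z2 z3], ymm1 = [z4 z5 0 0].
+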
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_ab_6_lib4, @function
+inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_ab_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_ab_6_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_ab_6_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vhaddpd %ymm5, %ymm4, %ymm4
+// vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vextractf128 $0x1, %ymm4, %xmm5
+ vaddpd %ymm0, %ymm1, %ymm0
+ vaddpd %ymm4, %ymm5, %ymm4
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+ vmulpd %ymm4, %ymm15, %ymm1
+
+ // beta
+ vbroadcastsd 0(%r11), %ymm15
+ vmovupd 0(%r12), %ymm14
+ vmovupd 32(%r12), %ymm13
+ vfmadd231pd %ymm15, %ymm14, %ymm0
+ vfmadd231pd %ymm15, %ymm13, %ymm1
+
+ vxorpd %ymm15, %ymm15, %ymm15
+ vblendpd $0x3, %ymm1, %ymm15, %ymm1
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_ab_6_lib4, .-inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
+
+
+
+
+
+#if 0
+
+//TODO
+// common inner routine with file scope
+//
+// blend for ta==t, scale for generic alpha and beta=1.0
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0a z0b z0c z0d]
+// ymm1 <- [z1a z1b z1c z1d]
+// ymm2 <- [z2a z2b z2c z2d]
+// ymm3 <- [z3a z3b z3c z3d]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- y
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- dirty
+// ymm2 <- dirty
+// ymm3 <- dirty
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_t_scale_a1_4_lib4, @function
+inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_t_scale_a1_4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_t_scale_a1_4_lib4; .scl 2; .type 32; .endef
+inner_blend_t_scale_a1_4_lib4:
+#endif
+#endif
+
+ // reduction
+ vhaddpd %ymm1, %ymm0, %ymm0
+ vhaddpd %ymm3, %ymm2, %ymm2
+ vperm2f128 $0x2, %ymm0, %ymm2, %ymm1
+ vperm2f128 $0x13, %ymm0, %ymm2, %ymm0
+ vaddpd %ymm0, %ymm1, %ymm0
+
+ // alpha
+ vbroadcastsd 0(%r10), %ymm15
+ vmulpd %ymm0, %ymm15, %ymm0
+
+ // beta
+ vmovupd 0(%r11), %ymm14
+ vaddpd %ymm0, %ymm14, %ymm0
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_t_scale_a1_4_lib4, .-inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+#endif
+
+
+
+
+// common inner routine with file scope
+//
+// store
+//
+// input arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+//
+// output arguments:
+// r10 <- z
+// ymm0 <- [z0 z1 z2 z3]
+// ymm1 <- [z4 z5 xx xx]
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_6_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_6_lib4, @function
+inner_store_6_lib4:
+#elif defined(OS_MAC)
+_inner_store_6_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_6_lib4; .scl 2; .type 32; .endef
+inner_store_6_lib4:
+#endif
+#endif
+
+ vmovupd %ymm0, 0(%r10)
+ vmovupd %xmm1, 32(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_6_lib4, .-inner_store_6_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dgemv_nt_6_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
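+//
+// Sketch of the operation (inferred from the inner routines above; A is k x 6 in
+// panel-major bs=4 storage):
+//   z_n[0:k] = z_n[0:k] + alpha_n[0] * A * x_n[0:6]
+//   z_t[0:6] = beta_t[0] * y_t[0:6] + alpha_t[0] * A^T * x_t[0:k]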
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemv_nt_6_lib4
+ .type kernel_dgemv_nt_6_lib4, @function
+kernel_dgemv_nt_6_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemv_nt_6_lib4
+_kernel_dgemv_nt_6_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemv_nt_6_lib4
+ .def kernel_dgemv_nt_6_lib4; .scl 2; .type 32; .endef
+kernel_dgemv_nt_6_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha_n
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG6, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+ vbroadcastsd 32(%r10), %ymm10
+ vmulpd %ymm15, %ymm10, %ymm10
+ vbroadcastsd 40(%r10), %ymm11
+ vmulpd %ymm15, %ymm11, %ymm11
+
+
+ // inner kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // A
+ movq ARG5, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG7, %r13 // x_t
+ movq ARG10, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_6_lib4
+#endif
+#endif
+
+
+ // inner blend n scale ab
+
+ movq ARG3, %r10 // alpha_t
+ movq ARG8, %r11 // beta_t
+ movq ARG9, %r12 // y_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_AB_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_ab_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_ab_6_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG11, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_6_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_6_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_6_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemv_nt_6_lib4, .-kernel_dgemv_nt_6_lib4
+#endif
+
+
+
+
+
+#if 0
+// TODO
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x_n, double *x_t, double *z_n, double *z_t);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsymv_l_4_lib4
+ .type kernel_dsymv_l_4_lib4, @function
+kernel_dsymv_l_4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsymv_l_4_lib4
+_kernel_dsymv_l_4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsymv_l_4_lib4
+ .def kernel_dsymv_l_4_lib4; .scl 2; .type 32; .endef
+kernel_dsymv_l_4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers y_t
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+ // initialize x_n
+ movq ARG2, %r10 // alpha
+ vbroadcastsd 0(%r10), %ymm15
+
+ movq ARG5, %r10 // x_n
+
+ vbroadcastsd 0(%r10), %ymm6
+ vmulpd %ymm15, %ymm6, %ymm6
+ vbroadcastsd 8(%r10), %ymm7
+ vmulpd %ymm15, %ymm7, %ymm7
+ vbroadcastsd 16(%r10), %ymm8
+ vmulpd %ymm15, %ymm8, %ymm8
+ vbroadcastsd 24(%r10), %ymm9
+ vmulpd %ymm15, %ymm9, %ymm9
+
+
+ // inner edge dsyrk & kernel dgemv nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+// movslq %r12d, %r12
+ movq ARG6, %r13 // x_t
+ movq ARG7, %r14 // z_n
+
+#if MACRO_LEVEL>=2
+ INNER_EDGE_DSYMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dsymv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dsymv_add_nt_4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemv_add_nt_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemv_add_nt_4_lib4
+#endif
+#endif
+
+
+ // call inner blend n scale ab
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_T_SCALE_A1_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_t_scale_a1_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_t_scale_a1_4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // z_t
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsymv_l_4_lib4, .-kernel_dsymv_l_4_lib4
+#endif
+
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
+
+
+
+
diff --git a/kernel/avx2/kernel_sgemm_16x4_lib8.S b/kernel/avx2/kernel_sgemm_16x4_lib8.S
new file mode 100644
index 0000000..857fb11
--- /dev/null
+++ b/kernel/avx2/kernel_sgemm_16x4_lib8.S
@@ -0,0 +1,6811 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
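+// Plain-C sketch of the accumulation below (inferred; acc is an illustrative
+// name; single precision, bs=8 panel-major: A(i,l) at A[(i/8)*8*sda + l*8 + i%8],
+// B(j,l) at B[l*8 + j]):
+//
+//   for(ll=0; ll<k; ll++)
+//       for(jj=0; jj<4; jj++)
+//           for(ii=0; ii<16; ii++)
+//               acc[ii][jj] += A[(ii/8)*8*sda + ll*8 + ii%8] * B[ll*8 + jj];
+//
+// acc[0:8][jj] lives in ymm0-3 and acc[8:16][jj] in ymm4-7, one register per column jj.
+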
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_16x4_lib8, @function
+inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_16x4_lib8:
+#endif
+#endif
+
+// broadcast scheme
+#if 1
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+ vmovaps 0(%r11, %r12, 1), %ymm14 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ subl $4, %r10d
+
+	// unroll 1
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd -32(%r11), %ymm10 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd -32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r13
+
+	// unroll 3
+ vbroadcastss -32(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ subl $4, %r10d
+
+	// unroll 1
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd -32(%r11), %ymm10 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd -32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r13
+
+	// unroll 3
+ vbroadcastss -32(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // a
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 0(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vbroadcastss 4(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ subl $1, %r10d
+ vbroadcastss 8(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ addq $32, %r11
+ vbroadcastss 12(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+// shuffle scheme
+#else
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(float)
+
+ // preload
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vfmadd231ps %ymm8, %ymm14, %ymm0
+ vfmadd231ps %ymm9, %ymm14, %ymm4
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmovaps 32(%r11), %ymm10 // A0
+ vfmadd231ps %ymm8, %ymm14, %ymm1
+ vfmadd231ps %ymm9, %ymm14, %ymm5
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+
+ vmovaps 32(%r15), %ymm11 // A1
+ vfmadd231ps %ymm8, %ymm14, %ymm2
+ vfmadd231ps %ymm9, %ymm14, %ymm6
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+
+ vfmadd231ps %ymm8, %ymm14, %ymm3
+ vfmadd231ps %ymm9, %ymm14, %ymm7
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+
+
+ // unroll 1
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm14, %ymm0
+ vfmadd231ps %ymm11, %ymm14, %ymm4
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+
+ vmovaps 64(%r11), %ymm8 // A0
+ vfmadd231ps %ymm10, %ymm14, %ymm1
+ vfmadd231ps %ymm11, %ymm14, %ymm5
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+
+ vmovaps 64(%r15), %ymm9 // A1
+ vfmadd231ps %ymm10, %ymm14, %ymm2
+ vfmadd231ps %ymm11, %ymm14, %ymm6
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+
+ vfmadd231ps %ymm10, %ymm14, %ymm3
+ vfmadd231ps %ymm11, %ymm14, %ymm7
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+
+ // unroll 2
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vfmadd231ps %ymm8, %ymm14, %ymm0
+ vfmadd231ps %ymm9, %ymm14, %ymm4
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+
+ addq $128, %r13
+ vmovaps 96(%r11), %ymm10 // A0
+ vfmadd231ps %ymm8, %ymm14, %ymm1
+ vfmadd231ps %ymm9, %ymm14, %ymm5
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+
+ addq $128, %r11
+ vmovaps 96(%r15), %ymm11 // A1
+ vfmadd231ps %ymm8, %ymm14, %ymm2
+ vfmadd231ps %ymm9, %ymm14, %ymm6
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+
+ addq $128, %r15
+ vfmadd231ps %ymm8, %ymm14, %ymm3
+ vfmadd231ps %ymm9, %ymm14, %ymm7
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+
+
+ // unroll 3
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm14, %ymm0
+ vfmadd231ps %ymm11, %ymm14, %ymm4
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+
+ vmovaps 0(%r11), %ymm8 // A0
+ vfmadd231ps %ymm10, %ymm14, %ymm1
+ vfmadd231ps %ymm11, %ymm14, %ymm5
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+
+ vmovaps 0(%r15), %ymm9 // A1
+ vfmadd231ps %ymm10, %ymm14, %ymm2
+ vfmadd231ps %ymm11, %ymm14, %ymm6
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+
+ vfmadd231ps %ymm10, %ymm14, %ymm3
+ vfmadd231ps %ymm11, %ymm14, %ymm7
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastf128 32(%r13), %ymm13 // B
+ vfmadd231ps %ymm8, %ymm14, %ymm0
+ vfmadd231ps %ymm9, %ymm14, %ymm4
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+
+ subl $4, %r10d
+ vmovaps 32(%r11), %ymm10 // A0
+ vfmadd231ps %ymm8, %ymm14, %ymm1
+ vfmadd231ps %ymm9, %ymm14, %ymm5
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+
+ vmovaps 32(%r15), %ymm11 // A1
+ vfmadd231ps %ymm8, %ymm14, %ymm2
+ vfmadd231ps %ymm9, %ymm14, %ymm6
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+
+ vfmadd231ps %ymm8, %ymm14, %ymm3
+ vfmadd231ps %ymm9, %ymm14, %ymm7
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+
+
+ // unroll 1
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm14, %ymm0
+ vfmadd231ps %ymm11, %ymm14, %ymm4
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+
+ vmovaps 64(%r11), %ymm8 // A0
+ vfmadd231ps %ymm10, %ymm14, %ymm1
+ vfmadd231ps %ymm11, %ymm14, %ymm5
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+
+ vmovaps 64(%r15), %ymm9 // A1
+ vfmadd231ps %ymm10, %ymm14, %ymm2
+ vfmadd231ps %ymm11, %ymm14, %ymm6
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+
+ vfmadd231ps %ymm10, %ymm14, %ymm3
+ vfmadd231ps %ymm11, %ymm14, %ymm7
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+
+ // unroll 2
+ vbroadcastf128 96(%r13), %ymm13 // B
+ vfmadd231ps %ymm8, %ymm14, %ymm0
+ vfmadd231ps %ymm9, %ymm14, %ymm4
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+
+ addq $128, %r13
+ vmovaps 96(%r11), %ymm10 // A0
+ vfmadd231ps %ymm8, %ymm14, %ymm1
+ vfmadd231ps %ymm9, %ymm14, %ymm5
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+
+ addq $128, %r11
+ vmovaps 96(%r15), %ymm11 // A1
+ vfmadd231ps %ymm8, %ymm14, %ymm2
+ vfmadd231ps %ymm9, %ymm14, %ymm6
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+
+ addq $128, %r15
+ vfmadd231ps %ymm8, %ymm14, %ymm3
+ vfmadd231ps %ymm9, %ymm14, %ymm7
+ vshufps $0x00, %ymm13, %ymm13, %ymm14
+
+
+ // unroll 3
+// vbroadcastf128 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm14, %ymm0
+ vfmadd231ps %ymm11, %ymm14, %ymm4
+ vshufps $0x55, %ymm13, %ymm13, %ymm14
+
+// vmovaps 0(%r11), %ymm8 // A0
+ vfmadd231ps %ymm10, %ymm14, %ymm1
+ vfmadd231ps %ymm11, %ymm14, %ymm5
+ vshufps $0xaa, %ymm13, %ymm13, %ymm14
+
+// vmovaps 0(%r15), %ymm9 // A1
+ vfmadd231ps %ymm10, %ymm14, %ymm2
+ vfmadd231ps %ymm11, %ymm14, %ymm6
+ vshufps $0xff, %ymm13, %ymm13, %ymm14
+
+ vfmadd231ps %ymm10, %ymm14, %ymm3
+ vfmadd231ps %ymm11, %ymm14, %ymm7
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm8 // A0
+ vmovaps 0(%r15), %ymm9 // A1
+ vshufps $0x00, %ymm12, %ymm12, %ymm14
+ vfmadd231ps %ymm8, %ymm14, %ymm0
+ vfmadd231ps %ymm9, %ymm14, %ymm4
+
+ vshufps $0x55, %ymm12, %ymm12, %ymm14
+ vfmadd231ps %ymm8, %ymm14, %ymm1
+ vfmadd231ps %ymm9, %ymm14, %ymm5
+
+ vshufps $0xaa, %ymm12, %ymm12, %ymm14
+ vfmadd231ps %ymm8, %ymm14, %ymm2
+ vfmadd231ps %ymm9, %ymm14, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r13
+ addq $32, %r15
+
+ vshufps $0xff, %ymm12, %ymm12, %ymm14
+ vfmadd231ps %ymm8, %ymm14, %ymm3
+ vfmadd231ps %ymm9, %ymm14, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_16x4_lib8, .-inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
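+// Same loop structure as inner_kernel_gemm_add_nt_16x4_lib8 above, only with the
+// sign flipped (vfnmadd231ps instead of vfmadd231ps), i.e. roughly
+//
+//   acc[ii][jj] -= A[(ii/8)*8*sda + ll*8 + ii%8] * B[ll*8 + jj];
+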
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_16x4_lib8, @function
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+ vmovaps 0(%r11, %r12, 1), %ymm14 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ subl $4, %r10d
+
+	// unroll 1
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm0
+ vfnmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm1
+ vfnmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm2
+ vfnmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm3
+ vfnmadd231ps %ymm11, %ymm12, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd -32(%r11), %ymm10 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd -32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r13
+
+	// unroll 3
+ vbroadcastss -32(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm0
+ vfnmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm1
+ vfnmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm2
+ vfnmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm3
+ vfnmadd231ps %ymm11, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ subl $4, %r10d
+
+	// unroll 1
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm0
+ vfnmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm1
+ vfnmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm2
+ vfnmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm3
+ vfnmadd231ps %ymm11, %ymm12, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd -32(%r11), %ymm10 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd -32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r13
+
+	// unroll 3
+ vbroadcastss -32(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm0
+ vfnmadd231ps %ymm11, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm1
+ vfnmadd231ps %ymm11, %ymm12, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm2
+ vfnmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfnmadd231ps %ymm10, %ymm12, %ymm3
+ vfnmadd231ps %ymm11, %ymm12, %ymm7
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // a
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 0(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vbroadcastss 4(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ subl $1, %r10d
+ vbroadcastss 8(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ addq $32, %r11
+ vbroadcastss 12(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_16x4_lib8, .-inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r14 <- 4*sdb*sizeof(double)
+// r15 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
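+// Plain-C sketch of the accumulation below (inferred; here B is not transposed,
+// so B(l,j) is assumed at B[(l/8)*8*sdb + j*8 + l%8], bs=8 panel-major):
+//
+//   for(ll=0; ll<k; ll++)
+//       for(jj=0; jj<4; jj++)
+//           for(ii=0; ii<16; ii++)
+//               acc[ii][jj] += A[(ii/8)*8*sda + ll*8 + ii%8]
+//                            * B[(ll/8)*8*sdb + jj*8 + ll%8];
+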
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_16x4_lib8, @function
+inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+ vmovaps 0(%r11, %r12, 1), %ymm14 // A
+
+ cmpl $8, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 1) // software prefetch
+ prefetcht0 64(%r13, %r14, 1) // software prefetch
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 96(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ subl $8, %r10d
+
+ // unroll 1
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 100(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 104(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+
+ // unroll 3
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 128(%r11), %ymm13 // A
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 128(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 108(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+	// unroll 4
+	vbroadcastss 16(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm0
+	vfmadd231ps %ymm14, %ymm12, %ymm4
+	vmovapd 160(%r11), %ymm10 // A
+	vbroadcastss 48(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm1
+	vfmadd231ps %ymm14, %ymm12, %ymm5
+	vmovapd 160(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss 80(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm2
+	vfmadd231ps %ymm14, %ymm12, %ymm6
+	vbroadcastss 112(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm3
+	vfmadd231ps %ymm14, %ymm12, %ymm7
+
+ // unroll 5
+ vbroadcastss 20(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 192(%r11), %ymm13 // A
+ vbroadcastss 52(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 192(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 84(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 116(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+	// unroll 6
+	vbroadcastss 24(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm0
+	vfmadd231ps %ymm14, %ymm12, %ymm4
+	vmovapd 224(%r11), %ymm10 // A
+	vbroadcastss 56(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm1
+	vfmadd231ps %ymm14, %ymm12, %ymm5
+	vmovapd 224(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss 88(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm2
+	vfmadd231ps %ymm14, %ymm12, %ymm6
+	vbroadcastss 120(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm3
+	vfmadd231ps %ymm14, %ymm12, %ymm7
+ addq $256, %r11
+
+ // unroll 7
+ vbroadcastss 28(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss 60(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 92(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+	vbroadcastss 124(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+ addq %r14, %r13
+
+ cmpl $8, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $7, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 32(%r11), %ymm10 // A
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 32(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 96(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ subl $8, %r10d
+
+ // unroll 1
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 100(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+ // unroll 2
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovapd 96(%r11), %ymm10 // A
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vmovapd 96(%r11, %r12, 1), %ymm11 // A
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 104(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+
+ // unroll 3
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 128(%r11), %ymm13 // A
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 128(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 108(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+	// unroll 4
+	vbroadcastss 16(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm0
+	vfmadd231ps %ymm14, %ymm12, %ymm4
+	vmovapd 160(%r11), %ymm10 // A
+	vbroadcastss 48(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm1
+	vfmadd231ps %ymm14, %ymm12, %ymm5
+	vmovapd 160(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss 80(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm2
+	vfmadd231ps %ymm14, %ymm12, %ymm6
+	vbroadcastss 112(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm3
+	vfmadd231ps %ymm14, %ymm12, %ymm7
+
+ // unroll 5
+ vbroadcastss 20(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+ vmovapd 192(%r11), %ymm13 // A
+ vbroadcastss 52(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+ vmovapd 192(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 84(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 116(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+
+	// unroll 6
+	vbroadcastss 24(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm0
+	vfmadd231ps %ymm14, %ymm12, %ymm4
+	vmovapd 224(%r11), %ymm10 // A
+	vbroadcastss 56(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm1
+	vfmadd231ps %ymm14, %ymm12, %ymm5
+	vmovapd 224(%r11, %r12, 1), %ymm11 // A
+	vbroadcastss 88(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm2
+	vfmadd231ps %ymm14, %ymm12, %ymm6
+	vbroadcastss 120(%r13), %ymm12 // B
+	vfmadd231ps %ymm13, %ymm12, %ymm3
+	vfmadd231ps %ymm14, %ymm12, %ymm7
+ addq $256, %r11
+
+ // unroll 7
+ vbroadcastss 28(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm0
+ vfmadd231ps %ymm11, %ymm12, %ymm4
+// vmovapd 0(%r11), %ymm13 // A
+ vbroadcastss 60(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm1
+ vfmadd231ps %ymm11, %ymm12, %ymm5
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vbroadcastss 92(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm2
+ vfmadd231ps %ymm11, %ymm12, %ymm6
+ vbroadcastss 124(%r13), %ymm12 // B
+ vfmadd231ps %ymm10, %ymm12, %ymm3
+ vfmadd231ps %ymm11, %ymm12, %ymm7
+ addq %r14, %r13
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vbroadcastss 0(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vbroadcastss 32(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vbroadcastss 64(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vbroadcastss 96(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_16x4_lib8, .-inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
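+// Sketch of the intent (inferred from the code below): with B starting offB
+// floats into its 8-wide panel, peel
+//   kend = min(k, 8-offB)
+// iterations of the same 16x4 rank-1 update, one column of A at a time, then,
+// if work remains, advance B to the start of its next panel.
+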
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_16x4_lib8, @function
+inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r15d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %ebx
+ subl %r15d, %ebx // 8-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,8-offsetB)
+
+ movl %r15d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r13 // B+offsetB*sizeof(float)
+
+1:
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vbroadcastss 0(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vbroadcastss 32(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vbroadcastss 64(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vbroadcastss 96(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r13 // B+1*sizeof(float)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r14, %r13
+ subq $32, %r13 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_16x4_lib8, .-inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- bs*sda*sizeof(double)
+// r13 <- B-offB+bs*sdb*sizeof(double)
+// r14 <- bs*sdb*sizeof(double)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
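+// Sketch of the intent (inferred from the code below): B is lower triangular, so
+// its row l contributes only to columns 0..l; the edge therefore processes the
+// first rows with 1, 2 and 3 active columns before the full 16x4 kernel takes
+// over, with separate branches for the possible starting offsets offB
+// (0..4, 5, 6, 7) inside B's 8-wide panel.
+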
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trmm_nn_rl_16x4_lib8, @function
+inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_16x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ movl %r15d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ movq %r13, %rbx // B
+ addq %rax, %rbx // B+offsetB*sizeof(float)
+
+
+ cmpl $4, %r15d
+ jg 1f
+
+ // offB==0, 1, 2, 3, 4
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 8(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 40(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+ vbroadcastss 72(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm2
+ vfmadd231ps %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+ cmpl $5, %r15d
+ jg 1f
+
+ // offB==5
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 8(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 40(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+ vbroadcastss 72(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm2
+ vfmadd231ps %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movl $0, %r15d // offsetB=0
+
+ jmp 0f // end
+
+
+1:
+ cmpl $6, %r15d
+ jg 1f
+
+ // offB==6
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movq %r13, %rbx // B
+ movl $0, %r15d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 32(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+ vbroadcastss 64(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm2
+ vfmadd231ps %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+// cmpl $7, %r15d
+// jg 0f
+
+	// offB==7
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movq %r13, %rbx // B
+ movl $0, %r15d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 32(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vmovaps 0(%r11, %r12, 1), %ymm9
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm0
+ vfmadd231ps %ymm9, %ymm12, %ymm4
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm1
+ vfmadd231ps %ymm9, %ymm12, %ymm5
+ vbroadcastss 68(%rbx), %ymm12
+ vfmadd231ps %ymm8, %ymm12, %ymm2
+ vfmadd231ps %ymm9, %ymm12, %ymm6
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+// jmp 0f // end
+
+
+ // end
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trmm_nn_rl_16x4_lib8, .-inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
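+//
+// note: this is a forward substitution on the 4 columns of the 16x4 accumulator:
+// column j is scaled by inv_diag_E[j] (reciprocal of the diagonal of E, precomputed by
+// the caller), then e_ij times column j is subtracted from each later column i; E is
+// read in bs=8 panel-major layout, so e_ij sits at byte offset 4*(i+8*j); kn cuts the
+// loop short when fewer than 4 columns are needed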
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_16x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_16x4_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastss 4(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vbroadcastss 8(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vbroadcastss 12(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastss 40(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vbroadcastss 44(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastss 76(%r10), %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_16x4_vs_lib8, .-inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
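+//
+// note: factorizes the 4x4 diagonal block held in rows 0-3 of the accumulator, one
+// column at a time: if d_jj>0.0 compute 1.0/sqrt(d_jj) (otherwise use 0.0), store it to
+// inv_diag_E[j], scale column j of both 8-row panels with it, then subtract l_ij times
+// column j from the remaining columns; kn (r11d) limits the number of columns processed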
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_16x4_vs_lib8, @function
+inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_16x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss	%xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+	vxorps	%ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+	vxorps	%ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+	vxorps	%ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_16x4_vs_lib8, .-inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
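+//
+// note: same column-by-column factorization as the 16x4 variant, but here the 4x4
+// diagonal block sits in rows 4-7 of the first 8-row panel, i.e. in the upper 128-bit
+// lane of ymm0-ymm3, hence the vextractf128 $0x1 / vperm2f128 $0x11 used to pick the
+// diagonal and sub-diagonal entries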
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_12x4_vs_lib8, @function
+inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_12x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vextractf128 $0x1, %ymm0, %xmm13
+// vpermilps $0x00, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+
+
+ vextractf128 $0x1, %ymm1, %xmm13
+ vpermilps $0x55, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vpermilps $0xaa, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilps $0xff, %xmm13, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss	%xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+	vxorps	%ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+	vxorps	%ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+	vxorps	%ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_12x4_vs_lib8, .-inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
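+//
+// note: computes acc <- alpha*acc + beta*C on the 16x4 block; C is read as two aligned
+// 8x4 panels, the second one at C + r13 bytes; when beta==0.0 the C panels are not
+// accessed at all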
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_16x4_lib8, @function
+inner_scale_ab_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ movq %r12, %r15 // C1 <- C0
+ addq %r13, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ vmovaps 0(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_16x4_lib8, .-inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
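+//
+// note: generalized version of the scaling above, where C may start at a row offset
+// inside its panel; only the offset==0 path is implemented, the offset>0 branches below
+// are left as TODO and currently fall through to the end without adding beta*C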
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_16x4_gen_lib8, @function
+inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_16x4_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ movq %r13, %rax // C1 <- C0
+ addq %r14, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+
+ vmovaps 0(%rax), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm4
+ vmovaps 32(%rax), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm5
+ vmovaps 64(%rax), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm6
+ vmovaps 96(%rax), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %rax, %rbx // C1
+ addq %r14, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_16x4_gen_lib8, .-inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_16x4_lib8, @function
+inner_scale_a0_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_16x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_16x4_lib8, .-inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
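+//
+// note: special case alpha=1.0, beta=1.0: the two 8x4 panels of C are simply added to
+// the accumulator, the second panel being read at C + r11 bytes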
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_16x4_lib8, @function
+inner_scale_11_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_16x4_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_lib8:
+#endif
+#endif
+
+ movq %r10, %r15 // C1 <- C0
+ addq %r11, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_16x4_lib8, .-inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_16x4_gen_lib8, @function
+inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_16x4_gen_lib8:
+#endif
+#endif
+
+ movq %r11, %rax // C1 <- C0
+ addq %r12, %rax // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ vmovaps 0(%rax), %ymm14
+ vaddps %ymm4, %ymm14, %ymm4
+ vmovaps 32(%rax), %ymm14
+ vaddps %ymm5, %ymm14, %ymm5
+ vmovaps 64(%rax), %ymm14
+ vaddps %ymm6, %ymm14, %ymm6
+ vmovaps 96(%rax), %ymm14
+ vaddps %ymm7, %ymm14, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %rax, %rbx // C1
+ addq %r12, %rbx // C2 <- C1 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_16x4_gen_lib8, .-inner_scale_11_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_lib8, @function
+inner_store_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_lib8:
+#endif
+#endif
+
+ movq %r10, %r15 // D1 <- D0
+ addq %r11, %r15 // D1 <- D0 + 4*sdd*sizeof(double)
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r15)
+ vmovaps %ymm5, 32(%r15)
+ vmovaps %ymm6, 64(%r15)
+ vmovaps %ymm7, 96(%r15)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_lib8, .-inner_store_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
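+//
+// note: km (r12d) is converted to float, broadcast, and subtracted from the row-index
+// constant LC01 (defined elsewhere in this file, expected to hold the indices of rows
+// 8-15): the entries that come out negative select the valid rows of the lower 8x4
+// panel for vmaskmovps, while the upper panel is always stored in full; kn (r13d)
+// limits how many of the 4 columns are written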
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_vs_lib8, @function
+inner_store_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 7f // end
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 7f // end
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 7f // end
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+ jmp 0f
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_vs_lib8, .-inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
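+//
+// note: two row masks are built here: m0 against LC00 (row indices 0-7, defined
+// elsewhere in this file) selects rows >= m0 in the upper panel, m1 against LC01 (row
+// indices 8-15) selects rows < m1 in the lower panel; the accumulator columns are then
+// shifted left by n0 and the column count clipped to min(n1,4)-n0; only the offset==0
+// store path is implemented, the offset>0 branches are TODO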
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_16x4_gen_lib8, @function
+inner_store_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_16x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute D1
+ movq %r11, %rbx // D0
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ cmpl $2, %r15d
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%rbx)
+ jl 7f // end
+ cmpl $3, %r15d
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%rbx)
+ jl 7f // end
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%rbx)
+ je 7f // end
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%rbx)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbp // D1
+ addq %r12, %rbp // D2 <- D1 + 4*sdd*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_16x4_gen_lib8, .-inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
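+//
+// note: lower-triangular store: the blend masks 0x01/0x03/0x07 keep the old D values
+// in the strictly-upper entries of the 4x4 diagonal block (rows 0-2 of columns 1-3),
+// while everything on and below the diagonal, including the whole second 8-row panel,
+// is overwritten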
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_lib8, @function
+inner_store_l_16x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_lib8:
+#endif
+#endif
+
+ vmovaps 32(%r10), %ymm12
+ vmovaps 64(%r10), %ymm13
+ vmovaps 96(%r10), %ymm14
+
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vblendps $0x03, %ymm13, %ymm2, %ymm2
+ vblendps $0x07, %ymm14, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_lib8, .-inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_vs_lib8, @function
+inner_store_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_vs_lib8, .-inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_16X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_16x4_gen_lib8, @function
+inner_store_l_16x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_16x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_16x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%r11, %r12, 1)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_16x4_gen_lib8, .-inner_store_l_16x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
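+//
+// note: as the 16x4 lower store, but the 4x4 diagonal block starts at row 4 of the
+// first panel, so the blend masks 0x0f/0x1f/0x3f/0x7f additionally keep rows 0-3 of
+// every column (the part of D above the block) unchanged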
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_lib8, @function
+inner_store_l_12x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_lib8:
+#endif
+#endif
+
+ vmovaps 0(%r10), %ymm12
+ vmovaps 32(%r10), %ymm13
+ vmovaps 64(%r10), %ymm14
+ vmovaps 96(%r10), %ymm15
+
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vblendps $0x1f, %ymm13, %ymm1, %ymm1
+ vblendps $0x3f, %ymm14, %ymm2, %ymm2
+ vblendps $0x7f, %ymm15, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_lib8, .-inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 4*sdd*sizeof(double)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_vs_lib8, @function
+inner_store_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r10)
+ vmaskmovps %ymm4, %ymm15, 0(%r10, %r11, 1)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x1f, %ymm12, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmaskmovps %ymm5, %ymm15, 32(%r10, %r11, 1)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x3f, %ymm12, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmaskmovps %ymm6, %ymm15, 64(%r10, %r11, 1)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x7f, %ymm12, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmaskmovps %ymm7, %ymm15, 96(%r10, %r11, 1)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_vs_lib8, .-inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_12X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_12x4_gen_lib8, @function
+inner_store_l_12x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_12x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_12x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_12x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC01(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC01(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmovaps 0(%r11), %ymm12
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmaskmovps %ymm4, %ymm15, 0(%r11, %r12, 1)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x1f, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmaskmovps %ymm5, %ymm15, 32(%r11, %r12, 1)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x3f, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmaskmovps %ymm6, %ymm15, 64(%r11, %r12, 1)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x7f, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmaskmovps %ymm7, %ymm15, 96(%r11, %r12, 1)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_12x4_gen_lib8, .-inner_store_l_12x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemm_nt_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
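+//
+// computes D <- alpha * A * B^T + beta * C on a 16x4 block, with A, C and D stored in
+// 8-row panels (panel size bs=8 assumed from the lib8 naming) of strides sda, sdc and
+// sdd, and B (4 x k) stored in a single 8-row panel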
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_lib8
+ .type kernel_sgemm_nt_16x4_lib8, @function
+kernel_sgemm_nt_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_lib8
+_kernel_sgemm_nt_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_16x4_lib8
+ .def kernel_sgemm_nt_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_lib8, .-kernel_sgemm_nt_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_nt_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_vs_lib8
+ .type kernel_sgemm_nt_16x4_vs_lib8, @function
+kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_vs_lib8
+_kernel_sgemm_nt_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_16x4_vs_lib8
+ .def kernel_sgemm_nt_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq %rsi, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_vs_lib8, .-kernel_sgemm_nt_16x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_sgemm_nt_16x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_16x4_gen_lib8
+ .type kernel_sgemm_nt_16x4_gen_lib8, @function
+kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_16x4_gen_lib8
+_kernel_sgemm_nt_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_16x4_gen_lib8
+ .def kernel_sgemm_nt_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_16x4_gen_lib8, .-kernel_sgemm_nt_16x4_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_nn_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
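+//
+// nn variant: B is k x 4 and not transposed, so it is walked row-wise through its 8-row
+// panels (stride sdb) starting at row offsetB of the first panel; the edge routine above
+// consumes the rows up to the first panel boundary before the unrolled kernel takes over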
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_lib8
+ .type kernel_sgemm_nn_16x4_lib8, @function
+kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_lib8
+_kernel_sgemm_nn_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_lib8
+ .def kernel_sgemm_nn_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_lib8, .-kernel_sgemm_nn_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_sgemm_nn_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_vs_lib8
+ .type kernel_sgemm_nn_16x4_vs_lib8, @function
+kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_vs_lib8
+_kernel_sgemm_nn_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_vs_lib8
+ .def kernel_sgemm_nn_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+	movq	ARG1, %r10 // k
+	movq	ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+	movq	ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG13, %r12 // km
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_vs_lib8, .-kernel_sgemm_nn_16x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88 rsp+96
+// void kernel_sgemm_nn_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_16x4_gen_lib8
+ .type kernel_sgemm_nn_16x4_gen_lib8, @function
+kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_16x4_gen_lib8
+_kernel_sgemm_nn_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_16x4_gen_lib8
+ .def kernel_sgemm_nn_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // offsetC
+ movq ARG10, %r13 // C
+ movq ARG11, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG12, %r10 // offsetD
+ movq ARG13, %r11 // D
+ movq ARG14, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG15, %r13 // m0
+ movq ARG16, %r14 // m1
+ movq ARG17, %r15 // n0
+ movq ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_16x4_gen_lib8, .-kernel_sgemm_nn_16x4_gen_lib8
+#endif
+
+
+
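+// Note: in the _gen_ variant the extra offsetC/offsetD and m0/m1/n0/n1 arguments are used
+// by the blend/store routines to clip the 16x4 tile to an arbitrary sub-block and to C/D
+// matrices whose first row is not panel-aligned. Conceptually the store reduces to the
+// scalar sketch below; store_element and tile are illustrative names only:
+//
+//   for (int j = n0; j < n1 && j < 4; j++)
+//       for (int i = m0; i < m1 && i < 16; i++)
+//           store_element(D, offsetD, sdd, i, j, tile[i][j]);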
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_strsm_nt_rl_inv_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_16x4_lib8
+ .type kernel_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_16x4_lib8
+_kernel_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_16x4_lib8
+ .def kernel_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movl $4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_16x4_lib8, .-kernel_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
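+// Note: after the sub-gemm and the scale-11 step the 16x4 tile holds C - A*B^T; the trsm
+// edge then appears to solve D * E^T = tile for D, with E a 4x4 lower-triangular factor
+// and inv_diag_E holding precomputed reciprocals of its diagonal, so no divisions are
+// needed in the kernel. A scalar reference of that column sweep (illustrative only):
+//
+//   void trsm_rlt_inv_16x4_ref(float d[16][4], const float c[16][4],
+//           const float e[4][4], const float inv_diag_e[4])
+//       {
+//       for (int j = 0; j < 4; j++)
+//           for (int i = 0; i < 16; i++)
+//               {
+//               float tmp = c[i][j];
+//               for (int t = 0; t < j; t++)
+//                   tmp -= d[i][t] * e[j][t];
+//               d[i][j] = tmp * inv_diag_e[j];
+//               }
+//       }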
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_strsm_nt_rl_inv_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+ .type kernel_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_16x4_vs_lib8
+ .def kernel_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG11, %r12 // m1
+ movq ARG12, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
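+// Note: the _vs_ ("variable size") variants take trailing km/kn arguments; the vs store
+// writes only the leading km rows and kn columns of the 16x4 tile, so the same kernel can
+// handle the ragged edge of a matrix. Conceptually (D and tile are illustrative names):
+//
+//   for (int j = 0; j < kn; j++)
+//       for (int i = 0; i < km; i++)
+//           D[i][j] = tile[i][j];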
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_16x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movl $4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_16x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_lib8
+#endif
+
+
+
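+// Note: the fused sgemm_strsm kernels chain two rank-k updates before the solve: kp
+// iterations add Ap*Bp^T, km iterations subtract Am*Bm^T, the scale-11 step adds C, and
+// only then is the trsm edge applied. In formula form:
+//
+//   tile = C + Ap(:, 0:kp) * Bp(:, 0:kp)^T - Am(:, 0:km) * Bm(:, 0:km)^T
+//   D    = tile * inv(E^T)      // as in the plain strsm kernel above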
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
+// void kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movq ARG16, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG15, %r12 // km
+ movq ARG16, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_spotrf_nt_l_12x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_12x4_lib8
+ .type kernel_spotrf_nt_l_12x4_lib8, @function
+kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_12x4_lib8
+_kernel_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_12x4_lib8
+ .def kernel_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_12x4_lib8, .-kernel_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
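+// Note: the potrf edge appears to factor the 4 leading columns of the accumulated tile in
+// place (Cholesky), writing the reciprocal of each diagonal square root to inv_diag_D so
+// that later trsm kernels can reuse it. Scalar sketch for an m x 4 tile (illustrative
+// only, no handling of non-positive pivots):
+//
+//   #include <math.h>
+//   void potrf_edge_ref(int m, float d[][4], float inv_diag_d[4])
+//       {
+//       for (int j = 0; j < 4; j++)
+//           {
+//           for (int t = 0; t < j; t++)
+//               for (int i = j; i < m; i++)
+//                   d[i][j] -= d[i][t] * d[j][t];
+//           float inv = 1.0f / sqrtf(d[j][j]);
+//           inv_diag_d[j] = inv;
+//           for (int i = j; i < m; i++)
+//               d[i][j] *= inv;
+//           }
+//       }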
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_spotrf_nt_l_12x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_12x4_vs_lib8
+ .type kernel_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_12x4_vs_lib8
+_kernel_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_12x4_vs_lib8
+ .def kernel_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // m1
+ movq ARG11, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_12x4_vs_lib8, .-kernel_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_spotrf_nt_l_16x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_16x4_lib8
+ .type kernel_spotrf_nt_l_16x4_lib8, @function
+kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_16x4_lib8
+_kernel_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_16x4_lib8
+ .def kernel_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_16x4_lib8, .-kernel_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_spotrf_nt_l_16x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_16x4_vs_lib8
+ .type kernel_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_16x4_vs_lib8
+_kernel_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_16x4_vs_lib8
+ .def kernel_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // m1
+ movq ARG11, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_16x4_vs_lib8, .-kernel_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_ssyrk_spotrf_nt_l_12x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_12x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_12x4_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_12x4_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_12x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_ssyrk_spotrf_nt_l_16x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_16x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_16x4_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+	movl	$4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_16x4_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+ sall $5, %r12d // 4*sdap*sizeof(double)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+ sall $5, %r12d // 4*sdam*sizeof(double)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+ sall $5, %r11d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_16x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_16x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_ssyrk_nt_l_16x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_16x4_lib8
+ .type kernel_ssyrk_nt_l_16x4_lib8, @function
+kernel_ssyrk_nt_l_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_16x4_lib8
+_kernel_ssyrk_nt_l_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_16x4_lib8
+ .def kernel_ssyrk_nt_l_16x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_16x4_lib8, .-kernel_ssyrk_nt_l_16x4_lib8
+#endif
+
+
+
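+// Note: the ssyrk kernels compute the same A*B^T update as sgemm_nt but use the _l_ store
+// routines, which write only the lower-triangular part of the leading 4x4 diagonal block
+// plus everything below it; the strictly upper part of D is left untouched. Conceptually:
+//
+//   for (int j = 0; j < 4; j++)
+//       for (int i = j; i < 16; i++)     // i starts at j: skip the strictly upper triangle
+//           D[i][j] = tile[i][j];        // D and tile are illustrative names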
+
+
+//                                1      2             3         4        5         6            7         8        9         10      11      12
+// void kernel_ssyrk_nt_l_16x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_16x4_vs_lib8
+ .type kernel_ssyrk_nt_l_16x4_vs_lib8, @function
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_16x4_vs_lib8
+_kernel_ssyrk_nt_l_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_16x4_vs_lib8
+ .def kernel_ssyrk_nt_l_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_16x4_vs_lib8, .-kernel_ssyrk_nt_l_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_ssyrk_nt_l_12x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_12x4_lib8
+ .type kernel_ssyrk_nt_l_12x4_lib8, @function
+kernel_ssyrk_nt_l_12x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_12x4_lib8
+_kernel_ssyrk_nt_l_12x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_12x4_lib8
+ .def kernel_ssyrk_nt_l_12x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_12x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+	INNER_STORE_L_12X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_12x4_lib8, .-kernel_ssyrk_nt_l_12x4_lib8
+#endif
+
+
+
+
+
+//                                1      2             3         4        5         6            7         8        9         10      11      12
+// void kernel_ssyrk_nt_l_12x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_12x4_vs_lib8
+ .type kernel_ssyrk_nt_l_12x4_vs_lib8, @function
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_12x4_vs_lib8
+_kernel_ssyrk_nt_l_12x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_12x4_vs_lib8
+ .def kernel_ssyrk_nt_l_12x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_12x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_12X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_12x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_12x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_12x4_vs_lib8, .-kernel_ssyrk_nt_l_12x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strmm_nn_rl_16x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_lib8
+ .type kernel_strmm_nn_rl_16x4_lib8, @function
+kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_lib8
+_kernel_strmm_nn_rl_16x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_lib8
+ .def kernel_strmm_nn_rl_16x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_lib8, .-kernel_strmm_nn_rl_16x4_lib8
+#endif
+
+
+
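+// Note: unlike the gemm kernels, the trmm kernels have no C/beta term: the scale_a0 call
+// applies alpha only, so the stored result is D = alpha * A * B with B triangular; the
+// "initial triangle" edge exists to skip the part of B that is structurally zero. A plain
+// scalar reference that ignores the structure (column-major layouts assumed, illustrative
+// only):
+//
+//   void trmm_nn_rl_16x4_ref(int k, float alpha, const float *A, const float *B, float *D)
+//       {
+//       for (int j = 0; j < 4; j++)
+//           for (int i = 0; i < 16; i++)
+//               {
+//               float tmp = 0.0f;
+//               for (int l = 0; l < k; l++)
+//                   tmp += A[i + 16 * l] * B[l + k * j];
+//               D[i + 16 * j] = alpha * tmp;
+//               }
+//       }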
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_strmm_nn_rl_16x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_vs_lib8
+ .type kernel_strmm_nn_rl_16x4_vs_lib8, @function
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_vs_lib8
+_kernel_strmm_nn_rl_16x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_vs_lib8
+ .def kernel_strmm_nn_rl_16x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+ sall $5, %r11d // 4*sdd*sizeof(double)
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_vs_lib8, .-kernel_strmm_nn_rl_16x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_strmm_nn_rl_16x4_gen_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_16x4_gen_lib8
+ .type kernel_strmm_nn_rl_16x4_gen_lib8, @function
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_16x4_gen_lib8
+_kernel_strmm_nn_rl_16x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_16x4_gen_lib8
+ .def kernel_strmm_nn_rl_16x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_16x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 4*sda*sizeof(double)
+ movq ARG6, %r13 // B
+ movq ARG7, %r14 // sdb
+ sall $5, %r14d // 4*sdb*sizeof(double)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_16x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_16x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_16X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_16x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_16x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // offsetD
+ movq ARG9, %r11 // D
+ movq ARG10, %r12 // sdd
+ sall $5, %r12d // 4*sdd*sizeof(double)
+ movq ARG11, %r13 // m0
+ movq ARG12, %r14 // m1
+ movq ARG13, %r15 // n0
+ movq ARG14, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_16X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_16x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_16x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_16x4_gen_lib8, .-kernel_strmm_nn_rl_16x4_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
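+// Note: the .long values above are IEEE-754 single-precision bit patterns, e.g.
+// 1056964608 = 0.5f, 1089470464 = 7.5f, 1065353216 = 1.0f and 3212836864 = -1.0f; the
+// half-integer ramps in LC00-LC02 are most likely compared against row indices to build
+// per-lane store masks. A small C sketch that reproduces such constants (float_bits is a
+// hypothetical helper, not part of these sources):
+//
+//   #include <stdint.h>
+//   #include <string.h>
+//   static uint32_t float_bits(float x)
+//       {
+//       uint32_t u;
+//       memcpy(&u, &x, sizeof u);
+//       return u;   // float_bits(0.5f) == 1056964608u, float_bits(-1.0f) == 3212836864u
+//       }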
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_sgemm_24x4_lib8.S b/kernel/avx2/kernel_sgemm_24x4_lib8.S
new file mode 100644
index 0000000..b3a027f
--- /dev/null
+++ b/kernel/avx2/kernel_sgemm_24x4_lib8.S
@@ -0,0 +1,7734 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
+
+
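+// Note on the argument macros above: on SysV (OS_LINUX/OS_MAC) the first stack argument is
+// the 7th one, and after the prologue subtracts STACKSIZE it sits above the saved return
+// address, hence ARG7 = STACKSIZE + 8(%rsp), ARG8 = STACKSIZE + 16(%rsp), and so on. On
+// Windows the first four arguments are in rcx/rdx/r8/r9 and the caller reserves a 32-byte
+// shadow space, so the 5th argument starts at STACKSIZE + 40(%rsp). Worked example:
+//
+//   ARG9 (SysV)    = STACKSIZE + 8 (return address) + 2*8 (args 7 and 8) = STACKSIZE + 24(%rsp)
+//   ARG9 (Windows) = STACKSIZE + 40 (ret + shadow)  + 4*8 (args 5..8)    = STACKSIZE + 72(%rsp)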
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- 4*sda*sizeof(double)
+// r13 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
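+// Note: the inner_* routines below are not C-ABI functions: they are reached with call/ret
+// but take k, A, the panel byte stride and B in r10-r13, and keep the 24x4 accumulator in
+// ymm0-ymm11 (three 8-row panels of A times four columns of B), as the register comments
+// above describe. One k-iteration of the broadcast scheme, conceptually (acc, A_panel and
+// B are illustrative names):
+//
+//   for (int j = 0; j < 4; j++)           // one vbroadcastss of B[j] per column
+//       for (int p = 0; p < 3; p++)       // the three 8-row panels of A
+//           for (int i = 0; i < 8; i++)   // the 8 lanes of one ymm register
+//               acc[p][j][i] += A_panel[p][i] * B[j];   // one vfmadd231ps
+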
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_24x4_lib8, @function
+inner_kernel_gemm_add_nt_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_24x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_24x4_lib8:
+#endif
+#endif
+
+// broadcast scheme
+#if 1
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+ vmovaps 0(%r11, %r12, 1), %ymm14 // A
+ vmovaps 0(%r11, %r12, 2), %ymm15 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ subl $4, %r10d
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 1
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 64(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 2
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd -32(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd -32(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd -32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 3
+ vbroadcastss 96(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ addq $128, %r13
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 0(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ subl $4, %r10d
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 1
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 64(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 2
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd -32(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd -32(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd -32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 3
+ vbroadcastss 96(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ addq $128, %r13
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+// vmovapd 0(%r11), %ymm13 // A
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+// vmovapd 0(%r11, %r12, 2), %ymm15 // A
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // a
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A
+ vbroadcastss 0(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 4(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ subl $1, %r10d
+ vbroadcastss 8(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ addq $32, %r11
+ vbroadcastss 12(%r13), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+// shuffle scheme
+#else
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ movq %r11, %r15 // A1 <- A0
+ addq %r12, %r15 // A1 <- A0 + 4*sda*sizeof(float)
+ movq %r15, %rax // A2 <- A1
+ addq %r12, %rax // A2 <- A1 + 4*sda*sizeof(float)
+
+ // preload
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%r11), %ymm13 // A0
+ vmovaps 0(%r15), %ymm14 // A1
+ vmovaps 0(%rax), %ymm15 // A2
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovaps 32(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovaps 32(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vbroadcastf128 32(%r13), %ymm12 // B
+ vmovaps 32(%rax), %ymm15 // A2
+
+
+ // unroll 1
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovaps 64(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovaps 64(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vmovaps 64(%rax), %ymm15 // A2
+
+
+ // unroll 2
+ subl $4, %r10d
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovaps 96(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovaps 96(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vbroadcastf128 96(%r13), %ymm12 // B
+ vmovaps 96(%rax), %ymm15 // A2
+
+
+ // unroll 3
+ addq $128, %r13
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ addq $128, %r11
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ addq $128, %r15
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ addq $128, %rax
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovaps 0(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovaps 0(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vbroadcastf128 0(%r13), %ymm12 // B
+ vmovaps 0(%rax), %ymm15 // A2
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovaps 32(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovaps 32(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vbroadcastf128 32(%r13), %ymm12 // B
+ vmovaps 32(%rax), %ymm15 // A2
+
+
+ // unroll 1
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovaps 64(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovaps 64(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vbroadcastf128 64(%r13), %ymm12 // B
+ vmovaps 64(%rax), %ymm15 // A2
+
+
+ // unroll 2
+ subl $4, %r10d
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vmovaps 96(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vmovaps 96(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ vbroadcastf128 96(%r13), %ymm12 // B
+ vmovaps 96(%rax), %ymm15 // A2
+
+
+ // unroll 3
+ addq $128, %r13
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ addq $128, %r11
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vpermilps $0x4e, %ymm12, %ymm12
+
+ addq $128, %r15
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ vpermilps $0xb1, %ymm12, %ymm12
+
+ addq $128, %rax
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+// vmovaps 0(%r11), %ymm13 // A0
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+// vmovaps 0(%r15), %ymm14 // A1
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+// vbroadcastf128 0(%r13), %ymm12 // B
+// vmovaps 0(%rax), %ymm15 // A2
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 32(%r13), %ymm12 // B
+ vmovaps 32(%r11), %ymm13 // A0
+ vmovaps 32(%r15), %ymm14 // A1
+ vmovaps 32(%rax), %ymm15 // A2
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ subl $1, %r10d
+
+ vpermilps $0xb1, %ymm12, %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ addq $32, %r11
+
+ vpermilps $0x4e, %ymm12, %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+ addq $32, %r13
+
+ vpermilps $0xb1, %ymm12, %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ vfmadd231ps %ymm15, %ymm12, %ymm11
+ addq $32, %r15
+
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_24x4_lib8, .-inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
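+// Reference sketch (illustrative only, not part of the build): same addressing as
+// the _add_ variant above, with the products subtracted via vfnmadd231ps.
+//
+//   for (kk = 0; kk < k; kk++)
+//     for (jj = 0; jj < 4; jj++)
+//       for (ii = 0; ii < 8; ii++) {
+//         acc0[jj][ii] -= A0[8*kk+ii] * B[8*kk+jj];
+//         acc1[jj][ii] -= A1[8*kk+ii] * B[8*kk+jj];
+//         acc2[jj][ii] -= A2[8*kk+ii] * B[8*kk+jj];
+//       }
+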
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_24x4_lib8, @function
+inner_kernel_gemm_sub_nt_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_24x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_24x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+ vmovaps 0(%r11, %r12, 1), %ymm14 // A
+ vmovaps 0(%r11, %r12, 2), %ymm15 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ subl $4, %r10d
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 1
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 64(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 2
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd -32(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd -32(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd -32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 3
+ vbroadcastss 96(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ addq $128, %r13
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 0(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ subl $4, %r10d
+ vbroadcastss 4(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 8(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 12(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 32(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 32(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 1
+ vbroadcastss 32(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 40(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 44(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd 64(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd 64(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd 64(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 2
+ vbroadcastss 64(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ addq $128, %r11
+ vbroadcastss 68(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 72(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss 76(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vmovapd -32(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vmovapd -32(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ vmovapd -32(%r11, %r12, 2), %ymm15 // A
+
+	// unroll 3
+ vbroadcastss 96(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ addq $128, %r13
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss -28(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss -24(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ vbroadcastss -20(%r13), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+// vmovapd 0(%r11), %ymm13 // A
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+// vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+// vmovapd 0(%r11, %r12, 2), %ymm15 // A
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %ymm13 // a
+ vmovapd 0(%r11, %r12, 1), %ymm14 // A
+ vmovapd 0(%r11, %r12, 2), %ymm15 // A
+ vbroadcastss 0(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vfnmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 4(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vfnmadd231ps %ymm15, %ymm12, %ymm9
+ subl $1, %r10d
+ vbroadcastss 8(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vfnmadd231ps %ymm15, %ymm12, %ymm10
+ addq $32, %r11
+ vbroadcastss 12(%r13), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ vfnmadd231ps %ymm15, %ymm12, %ymm11
+ addq $32, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_24x4_lib8, .-inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B
+// r14 <- 8*sdb*sizeof(float)
+// r15 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+8*k*sizeof(float)
+// r12 <- 8*sda*sizeof(float)
+// r13 <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
+// r14 <- 8*sdb*sizeof(float)
+// r15 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
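+// Reference sketch (illustrative only, not part of the build): the 'nn' update in
+// plain C, assuming A0/A1/A2 are the three 8-row panels of A and B is packed in
+// 8-row panels, column-major inside a panel, consecutive panels 8*sdb floats apart;
+// the names are made up for the sketch.
+//
+//   for (kk = 0; kk < k; kk++) {
+//     const float *Bp = B + (kk/8)*8*sdb + kk%8;   // row kk of the packed B
+//     for (jj = 0; jj < 4; jj++)
+//       for (ii = 0; ii < 8; ii++) {
+//         acc0[jj][ii] += A0[8*kk+ii] * Bp[8*jj];
+//         acc1[jj][ii] += A1[8*kk+ii] * Bp[8*jj];
+//         acc2[jj][ii] += A2[8*kk+ii] * Bp[8*jj];
+//       }
+//   }
+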
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_24x4_lib8, @function
+inner_kernel_gemm_add_nn_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_24x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_24x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r13, %r14, 1) // software prefetch
+ prefetcht0 64(%r13, %r14, 1) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 0(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 0(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 32(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 64(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 96(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A0
+ vmovaps 32(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 32(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 4(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 36(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 68(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 100(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A0
+ vmovaps 64(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 64(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 8(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 40(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 72(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 104(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A0
+ vmovaps 96(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 96(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 12(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 44(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 76(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 108(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A0
+ vmovaps 128(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 128(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 16(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 48(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 80(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 112(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A0
+ vmovaps 160(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 160(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 20(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 52(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 84(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 116(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A0
+ vmovaps 192(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 192(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 24(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 56(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ subl $8, %r10d
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 88(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vbroadcastss 120(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+
+ // unroll 7
+ vmovaps 224(%r11), %ymm12 // A0
+ vmovaps 224(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 224(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 28(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 60(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ addq $256, %r11
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 92(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 124(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+ addq %r14, %r13
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 0(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 0(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 32(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 64(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 96(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r13
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_24x4_lib8, .-inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(float)
+// r15 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(8-offB)
+// r11 <- A+(8-offB)*bs*sizeof(float)
+// r12 <- bs*sda*sizeof(float)
+// r13 <- B+bs*sdb*sizeof(float)
+// r14 <- bs*sdb*sizeof(float)
+// r15 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
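+// Reference sketch (illustrative only, not part of the build): the edge consumes
+// kend = min(k, 8-offB) iterations so that B becomes panel-aligned for the main
+// kernel; each iteration is the same 24x4 rank-1 update as the 'nn' kernel above,
+// reading row offB+step of the current B panel.
+//
+//   kend = k < 8-offB ? k : 8-offB;
+//   for (step = 0; step < kend; step++)
+//     for (jj = 0; jj < 4; jj++)
+//       for (ii = 0; ii < 8; ii++) {
+//         acc0[jj][ii] += A0[8*step+ii] * B[offB+step + 8*jj];
+//         acc1[jj][ii] += A1[8*step+ii] * B[offB+step + 8*jj];
+//         acc2[jj][ii] += A2[8*step+ii] * B[offB+step + 8*jj];
+//       }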
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_24x4_lib8, @function
+inner_edge_gemm_add_nn_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_24x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_24x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r15d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %ebx
+ subl %r15d, %ebx // 8-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,8-offsetB)
+
+ movl %r15d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r13 // B+offsetB*sizeof(float)
+
+1:
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A0
+ vmovaps 0(%r11, %r12, 1), %ymm13 // A1
+ vmovaps 0(%r11, %r12, 2), %ymm14 // A2
+ vbroadcastss 0(%r13), %ymm15 // B[0]
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vfmadd231ps %ymm14, %ymm15, %ymm8
+ vbroadcastss 32(%r13), %ymm15 // B[1]
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vfmadd231ps %ymm14, %ymm15, %ymm9
+ vbroadcastss 64(%r13), %ymm15 // B[2]
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vfmadd231ps %ymm14, %ymm15, %ymm10
+ vbroadcastss 96(%r13), %ymm15 // B[3]
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vfmadd231ps %ymm14, %ymm15, %ymm11
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r13 // B+1*sizeof(float)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r14, %r13
+ subq $32, %r13 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_24x4_lib8, .-inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- bs*sda*sizeof(float)
+// r13 <- B
+// r14 <- bs*sdb*sizeof(float)
+// r15 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-3
+// r11 <- A+3*bs*sizeof(float)
+// r12 <- bs*sda*sizeof(float)
+// r13 <- B, advanced by bs*sdb*sizeof(float) if offB>4
+// r14 <- bs*sdb*sizeof(float)
+// r15 <- (offB+3)%8
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
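+// Reference sketch (illustrative only, not part of the build): B is lower triangular
+// here, so the first three k-iterations only involve the columns jj <= kk; the
+// offB>4 branches below differ only in where B crosses into its next 8-row panel.
+// Akk/Bkk stand for column kk of A (24 rows) and row kk of the packed B; acc[jj] for
+// the 24-row column held in ymm{jj}/ymm{jj+4}/ymm{jj+8} (names made up for the sketch).
+//
+//   for (kk = 0; kk < 3 && kk < k; kk++)
+//     for (jj = 0; jj <= kk; jj++)
+//       for (ii = 0; ii < 24; ii++)
+//         acc[jj][ii] += Akk[ii] * Bkk[8*jj];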
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRMM_NN_RL_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trmm_nn_rl_24x4_lib8, @function
+inner_edge_trmm_nn_rl_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trmm_nn_rl_24x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_24x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ movl %r15d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ movq %r13, %rbx // B
+ addq %rax, %rbx // B+offsetB*sizeof(float)
+
+
+ cmpl $4, %r15d
+ jg 1f
+
+ // offB==0, 1, 2, 3, 4
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 8(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 40(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 72(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+ cmpl $5, %r15d
+ jg 1f
+
+ // offB==5
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 8(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 40(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 72(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movl $0, %r15d // offsetB=0
+
+ jmp 0f // end
+
+
+1:
+ cmpl $6, %r15d
+ jg 1f
+
+ // offB==6
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movq %r13, %rbx // B
+ movl $0, %r15d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 32(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 64(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+// cmpl $7, %r15d
+// jg 0f
+
+	// offB==7
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r14, %r13 // B+8*sdb*sizeof(float)
+ movq %r13, %rbx // B
+ movl $0, %r15d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 0(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 32(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm13
+ vmovaps 0(%r11, %r12, 1), %ymm14
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vbroadcastss 4(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vfmadd231ps %ymm15, %ymm12, %ymm8
+ vbroadcastss 36(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vfmadd231ps %ymm15, %ymm12, %ymm9
+ vbroadcastss 68(%rbx), %ymm12
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vfmadd231ps %ymm15, %ymm12, %ymm10
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r15d // offsetB+1
+
+// jmp 0f // end
+
+
+ // end
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trmm_nn_rl_24x4_lib8, .-inner_edge_trmm_nn_rl_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
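+// Reference sketch (illustrative only, not part of the build): in-place solve of
+// X * E^T = C on the accumulators, E being the 4x4 lower-triangular factor at r10
+// (D in the header above; element (mm,jj) sits at byte offset 32*jj+4*mm of its
+// panel) with the reciprocals of its diagonal at r11. acc[jj] is the 24-row column
+// held in ymm{jj}/ymm{jj+4}/ymm{jj+8} (names made up for the sketch).
+//
+//   for (jj = 0; jj < 4; jj++) {
+//     for (ii = 0; ii < 24; ii++) acc[jj][ii] *= inv_diag[jj];
+//     if (jj+1 >= kn) break;                       // vs variant: only kn columns
+//     for (mm = jj+1; mm < 4; mm++)
+//       for (ii = 0; ii < 24; ii++)
+//         acc[mm][ii] -= E[mm][jj] * acc[jj][ii];
+//   }
+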
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_24x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_24x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_24x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_24x4_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ vmulps %ymm8, %ymm13, %ymm8
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastss 4(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vfnmadd231ps %ymm8, %ymm13, %ymm9
+ vbroadcastss 8(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vfnmadd231ps %ymm8, %ymm13, %ymm10
+ vbroadcastss 12(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+ vfnmadd231ps %ymm8, %ymm13, %ymm11
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ vmulps %ymm9, %ymm13, %ymm9
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastss 40(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vfnmadd231ps %ymm9, %ymm13, %ymm10
+ vbroadcastss 44(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+ vfnmadd231ps %ymm9, %ymm13, %ymm11
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ vmulps %ymm10, %ymm13, %ymm10
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastss 76(%r10), %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+ vfnmadd231ps %ymm10, %ymm13, %ymm11
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+ vmulps %ymm11, %ymm13, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_24x4_vs_lib8, .-inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
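+// Reference sketch (illustrative only, not part of the build): a right-looking
+// Cholesky step on the 4 columns held in the accumulators, with the 4x4 diagonal
+// block in rows 0..3; acc[jj] is the 24-row column in ymm{jj}/ymm{jj+4}/ymm{jj+8}
+// (names made up for the sketch).
+//
+//   for (jj = 0; jj < 4; jj++) {
+//     float d = acc[jj][jj];
+//     float inv = (d > 0.0f) ? 1.0f/sqrtf(d) : 0.0f;
+//     inv_diag_E[jj] = inv;
+//     for (ii = 0; ii < 24; ii++) acc[jj][ii] *= inv;
+//     if (jj+1 >= kn) break;                        // vs variant: only kn columns
+//     for (mm = jj+1; mm < 4; mm++)
+//       for (ii = 0; ii < 24; ii++)
+//         acc[mm][ii] -= acc[jj][mm] * acc[jj][ii]; // uses the already-scaled column jj
+//   }
+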
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_24x4_vs_lib8, @function
+inner_edge_potrf_24x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_24x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_24x4_vs_lib8:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ vmulps %ymm8, %ymm13, %ymm8
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm15
+ vpermilps $0x55, %ymm15, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vfnmadd231ps %ymm8, %ymm13, %ymm9
+ vpermilps $0xaa, %ymm15, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vfnmadd231ps %ymm8, %ymm13, %ymm10
+ vpermilps $0xff, %ymm15, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+ vfnmadd231ps %ymm8, %ymm13, %ymm11
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ vmulps %ymm9, %ymm13, %ymm9
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm15
+ vpermilps $0xaa, %ymm15, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vfnmadd231ps %ymm9, %ymm13, %ymm10
+ vpermilps $0xff, %ymm15, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+ vfnmadd231ps %ymm9, %ymm13, %ymm11
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ vmulps %ymm10, %ymm13, %ymm10
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm15
+ vpermilps $0xff, %ymm15, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+ vfnmadd231ps %ymm10, %ymm13, %ymm11
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss	%xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+ vmulps %ymm11, %ymm13, %ymm11
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_24x4_vs_lib8, .-inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
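+// Reference note (illustrative only): same factorization as the 24x4 variant above,
+// except that the 4x4 diagonal block sits in rows 4..7 of the first 8-row panel (the
+// upper 128-bit lane, hence the vextractf128/vperm2f128 $0x11 below), i.e. the pivot
+// is d = acc[jj][4+jj] instead of acc[jj][jj].
+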
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_20x4_vs_lib8, @function
+inner_edge_potrf_20x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_20x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_20x4_vs_lib8:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vextractf128 $0x1, %ymm0, %xmm13
+// vpermilps $0x00, %xmm13, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vmulps %ymm4, %ymm13, %ymm4
+ vmulps %ymm8, %ymm13, %ymm8
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm15
+ vpermilps $0x55, %ymm15, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vfnmadd231ps %ymm8, %ymm13, %ymm9
+ vpermilps $0xaa, %ymm15, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vfnmadd231ps %ymm8, %ymm13, %ymm10
+ vpermilps $0xff, %ymm15, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+ vfnmadd231ps %ymm8, %ymm13, %ymm11
+
+
+ vextractf128 $0x1, %ymm1, %xmm13
+ vpermilps $0x55, %xmm13, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vmulps %ymm5, %ymm13, %ymm5
+ vmulps %ymm9, %ymm13, %ymm9
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm15
+ vpermilps $0xaa, %ymm15, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vfnmadd231ps %ymm9, %ymm13, %ymm10
+ vpermilps $0xff, %ymm15, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+ vfnmadd231ps %ymm9, %ymm13, %ymm11
+
+
+ vextractf128 $0x1, %ymm2, %xmm13
+ vpermilps $0xaa, %xmm13, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vmulps %ymm6, %ymm13, %ymm6
+ vmulps %ymm10, %ymm13, %ymm10
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm15
+ vpermilps $0xff, %ymm15, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+ vfnmadd231ps %ymm10, %ymm13, %ymm11
+
+
+ vextractf128 $0x1, %ymm3, %xmm13
+ vpermilps $0xff, %xmm13, %xmm13
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss	%xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vmulps %ymm7, %ymm13, %ymm7
+ vmulps %ymm11, %ymm13, %ymm11
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_20x4_vs_lib8, .-inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
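+// Reference sketch (illustrative only, not part of the build): acc0/acc1/acc2 are
+// the three 8-row blocks of the 24x4 tile (ymm0-3, ymm4-7, ymm8-11) and C0/C1/C2
+// the matching 8-row panels of C, r13 bytes apart (names made up for the sketch).
+// The beta==0.0 test below only skips the loads of C; the alpha scaling always runs.
+//
+//   for (jj = 0; jj < 4; jj++)
+//     for (ii = 0; ii < 8; ii++) {
+//       acc0[jj][ii] = alpha*acc0[jj][ii] + beta*C0[8*jj+ii];
+//       acc1[jj][ii] = alpha*acc1[jj][ii] + beta*C1[8*jj+ii];
+//       acc2[jj][ii] = alpha*acc2[jj][ii] + beta*C2[8*jj+ii];
+//     }
+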
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_24x4_lib8, @function
+inner_scale_ab_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_24x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_24x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ vmulps %ymm4, %ymm15, %ymm4
+ vmulps %ymm5, %ymm15, %ymm5
+ vmulps %ymm6, %ymm15, %ymm6
+ vmulps %ymm7, %ymm15, %ymm7
+
+ vmulps %ymm8, %ymm15, %ymm8
+ vmulps %ymm9, %ymm15, %ymm9
+ vmulps %ymm10, %ymm15, %ymm10
+ vmulps %ymm11, %ymm15, %ymm11
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ vmovaps 0(%r12, %r13, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 32(%r12, %r13, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 64(%r12, %r13, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 96(%r12, %r13, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+ vmovaps 0(%r12, %r13, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm8
+ vmovaps 32(%r12, %r13, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm9
+ vmovaps 64(%r12, %r13, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm10
+ vmovaps 96(%r12, %r13, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_24x4_lib8, .-inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
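+// Reference note (illustrative only): same alpha/beta update as
+// inner_scale_ab_24x4_lib8 above, but C may start at a row offset inside its panel
+// (r12); only the offset==0 path is implemented here, offsets 1..7 are TODO below.
+//
+//   if (offset == 0) { /* acc = alpha*acc + beta*C, as in inner_scale_ab_24x4_lib8 */ }
+//   else             { /* offsets 1..7: not implemented yet (TODO) */ }
+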
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_24X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_24x4_gen_lib8, @function
+inner_scale_ab_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_24x4_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ vmulps %ymm4, %ymm15, %ymm4
+ vmulps %ymm5, %ymm15, %ymm5
+ vmulps %ymm6, %ymm15, %ymm6
+ vmulps %ymm7, %ymm15, %ymm7
+
+ vmulps %ymm8, %ymm15, %ymm8
+ vmulps %ymm9, %ymm15, %ymm9
+ vmulps %ymm10, %ymm15, %ymm10
+ vmulps %ymm11, %ymm15, %ymm11
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ movq %r13, %r15 // C1 <- C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r14, %rax // C2 <- C1 + 8*sdc*sizeof(float)
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+ vmovaps 0(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm8
+ vmovaps 32(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm9
+ vmovaps 64(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm10
+ vmovaps 96(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm11
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%rax, %rbx // C2
+	addq	%r14, %rbx // C3 <- C2 + 8*sdc*sizeof(float)
+
+ cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_24x4_gen_lib8, .-inner_scale_ab_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_24x4_lib8, @function
+inner_scale_a0_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_24x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_24x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm12
+
+ vmulps %ymm0, %ymm12, %ymm0
+ vmulps %ymm1, %ymm12, %ymm1
+ vmulps %ymm2, %ymm12, %ymm2
+ vmulps %ymm3, %ymm12, %ymm3
+
+ vmulps %ymm4, %ymm12, %ymm4
+ vmulps %ymm5, %ymm12, %ymm5
+ vmulps %ymm6, %ymm12, %ymm6
+ vmulps %ymm7, %ymm12, %ymm7
+
+ vmulps %ymm8, %ymm12, %ymm8
+ vmulps %ymm9, %ymm12, %ymm9
+ vmulps %ymm10, %ymm12, %ymm10
+ vmulps %ymm11, %ymm12, %ymm11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_24x4_lib8, .-inner_scale_a0_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
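+// Note: the _11 variant below corresponds to alpha=1.0 and beta=1.0: the accumulators
+// are not rescaled and C is accumulated via an FMA against the constant vector loaded
+// from .LC03 (defined elsewhere in this file, presumably a vector of 1.0f); illustratively
+//
+//   acc[p][j][i] += C[p*8*sdc + j*8 + i];   // same indexing sketch as above
+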
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_24x4_lib8, @function
+inner_scale_11_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_24x4_lib8; .scl 2; .type 32; .endef
+inner_scale_11_24x4_lib8:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovaps .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovaps LC03(%rip), %ymm14
+#endif
+
+ vmovaps 0(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ vmovaps 0(%r10, %r11, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 32(%r10, %r11, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 64(%r10, %r11, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 96(%r10, %r11, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+ vmovaps 0(%r10, %r11, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm8
+ vmovaps 32(%r10, %r11, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm9
+ vmovaps 64(%r10, %r11, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm10
+ vmovaps 96(%r10, %r11, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_24x4_lib8, .-inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_24X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_24x4_gen_lib8, @function
+inner_scale_11_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_24x4_gen_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovaps .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovaps LC03(%rip), %ymm14
+#endif
+
+ vmovaps 0(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ vmovaps 0(%r11, %r12, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 32(%r11, %r12, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 64(%r11, %r12, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 96(%r11, %r12, 1), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+ vmovaps 0(%r11, %r12, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm8
+ vmovaps 32(%r11, %r12, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm9
+ vmovaps 64(%r11, %r12, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm10
+ vmovaps 96(%r11, %r12, 2), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm11
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_24x4_gen_lib8, .-inner_scale_11_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// r13 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
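+// Note: the NT micro-kernel leaves each group of four accumulators with their columns
+// interleaved; the two rounds of vblendps below (immediates 0xaa/0x55, then 0xcc/0x33)
+// undo that interleaving before the alpha/beta scaling, so that ymm0-ymm3, ymm4-ymm7
+// and ymm8-ymm11 again hold plain columns 0-3 of their 8-row panel. An illustrative
+// way to read the immediates: 0xaa keeps the odd lanes of one source and the even
+// lanes of the other, 0x55 is its complement, and 0xcc/0x33 do the same at the
+// granularity of lane pairs.
+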
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_24x4_lib8, @function
+inner_blend_scale_ab_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_24x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_24x4_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm4, %ymm15, %ymm4
+ vmulps %ymm5, %ymm15, %ymm5
+ vmulps %ymm6, %ymm15, %ymm6
+ vmulps %ymm7, %ymm15, %ymm7
+
+ vblendps $0xaa, %ymm9, %ymm8, %ymm12
+ vblendps $0x55, %ymm9, %ymm8, %ymm13
+ vblendps $0xaa, %ymm11, %ymm10, %ymm14
+ vblendps $0x55, %ymm11, %ymm10, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm8
+ vblendps $0x33, %ymm15, %ymm12, %ymm10
+ vblendps $0xcc, %ymm14, %ymm13, %ymm9
+ vblendps $0x33, %ymm14, %ymm13, %ymm11
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm8, %ymm15, %ymm8
+ vmulps %ymm9, %ymm15, %ymm9
+ vmulps %ymm10, %ymm15, %ymm10
+ vmulps %ymm11, %ymm15, %ymm11
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ movq %r12, %r15 // C1 <- C0
+	addq	%r13, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r13, %rax // C2 <- C1 + 8*sdc*sizeof(float)
+
+ vmovaps 0(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+ vmovaps 0(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm8
+ vmovaps 32(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm9
+ vmovaps 64(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm10
+ vmovaps 96(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_24x4_lib8, .-inner_blend_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_24X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_24x4_gen_lib8, @function
+inner_blend_scale_ab_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_24x4_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm4, %ymm15, %ymm4
+ vmulps %ymm5, %ymm15, %ymm5
+ vmulps %ymm6, %ymm15, %ymm6
+ vmulps %ymm7, %ymm15, %ymm7
+
+ vblendps $0xaa, %ymm9, %ymm8, %ymm12
+ vblendps $0x55, %ymm9, %ymm8, %ymm13
+ vblendps $0xaa, %ymm11, %ymm10, %ymm14
+ vblendps $0x55, %ymm11, %ymm10, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm8
+ vblendps $0x33, %ymm15, %ymm12, %ymm10
+ vblendps $0xcc, %ymm14, %ymm13, %ymm9
+ vblendps $0x33, %ymm14, %ymm13, %ymm11
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm8, %ymm15, %ymm8
+ vmulps %ymm9, %ymm15, %ymm9
+ vmulps %ymm10, %ymm15, %ymm10
+ vmulps %ymm11, %ymm15, %ymm11
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ movq %r13, %r15 // C1 <- C0
+	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r14, %rax // C2 <- C1 + 8*sdc*sizeof(float)
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+ vmovaps 0(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm8
+ vmovaps 32(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm9
+ vmovaps 64(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm10
+ vmovaps 96(%rax), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm11
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%rax, %rbx // C2
+	addq	%r14, %rbx // C3 <- C2 + 8*sdc*sizeof(float)
+
+ cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_24x4_gen_lib8, .-inner_blend_scale_ab_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// r11 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// r11 <- 8*sdc*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_24x4_lib8, @function
+inner_blend_scale_11_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_24x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_24x4_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vblendps $0xaa, %ymm9, %ymm8, %ymm12
+ vblendps $0x55, %ymm9, %ymm8, %ymm13
+ vblendps $0xaa, %ymm11, %ymm10, %ymm14
+ vblendps $0x55, %ymm11, %ymm10, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm8
+ vblendps $0x33, %ymm15, %ymm12, %ymm10
+ vblendps $0xcc, %ymm14, %ymm13, %ymm9
+ vblendps $0x33, %ymm14, %ymm13, %ymm11
+
+ movq %r10, %r15 // C1 <- C0
+	addq	%r11, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r11, %rax // C2 <- C1 + 8*sdc*sizeof(float)
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ vmovaps 0(%rax), %ymm15
+ vaddps %ymm15, %ymm8, %ymm8
+ vmovaps 32(%rax), %ymm15
+ vaddps %ymm15, %ymm9, %ymm9
+ vmovaps 64(%rax), %ymm15
+ vaddps %ymm15, %ymm10, %ymm10
+ vmovaps 96(%rax), %ymm15
+ vaddps %ymm15, %ymm11, %ymm11
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_24x4_lib8, .-inner_blend_scale_11_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 8*sdc*sizeof(float)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_24X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_24x4_gen_lib8, @function
+inner_blend_scale_11_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_24x4_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vblendps $0xaa, %ymm9, %ymm8, %ymm12
+ vblendps $0x55, %ymm9, %ymm8, %ymm13
+ vblendps $0xaa, %ymm11, %ymm10, %ymm14
+ vblendps $0x55, %ymm11, %ymm10, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm8
+ vblendps $0x33, %ymm15, %ymm12, %ymm10
+ vblendps $0xcc, %ymm14, %ymm13, %ymm9
+ vblendps $0x33, %ymm14, %ymm13, %ymm11
+
+ movq %r11, %r15 // C1 <- C0
+	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+	movq	%r15, %rax // C2 <- C1
+	addq	%r12, %rax // C2 <- C1 + 8*sdc*sizeof(float)
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vmovaps 32(%r11), %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vmovaps 64(%r11), %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vmovaps 96(%r11), %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ vmovaps 0(%r15), %ymm15
+ vaddps %ymm15, %ymm4, %ymm4
+ vmovaps 32(%r15), %ymm15
+ vaddps %ymm15, %ymm5, %ymm5
+ vmovaps 64(%r15), %ymm15
+ vaddps %ymm15, %ymm6, %ymm6
+ vmovaps 96(%r15), %ymm15
+ vaddps %ymm15, %ymm7, %ymm7
+
+ vmovaps 0(%rax), %ymm15
+ vaddps %ymm15, %ymm8, %ymm8
+ vmovaps 32(%rax), %ymm15
+ vaddps %ymm15, %ymm9, %ymm9
+ vmovaps 64(%rax), %ymm15
+ vaddps %ymm15, %ymm10, %ymm10
+ vmovaps 96(%rax), %ymm15
+ vaddps %ymm15, %ymm11, %ymm11
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%rax, %rbx // C2
+	addq	%r12, %rbx // C3 <- C2 + 8*sdc*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_24x4_gen_lib8, .-inner_blend_scale_11_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
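+// Note: the plain store below writes the full 24x4 block back to D as three aligned
+// 8-row panels spaced by the byte stride passed in r11. Illustrative C equivalent
+// (D treated as a byte pointer, acc as in the sketch above):
+//
+//   for(p=0; p<3; p++)
+//     for(j=0; j<4; j++)
+//       memcpy(D + p*sdd_bytes + j*32, &acc[p][j][0], 32);  // one 256-bit store each
+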
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_24x4_lib8, @function
+inner_store_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_24x4_lib8; .scl 2; .type 32; .endef
+inner_store_24x4_lib8:
+#endif
+#endif
+
+ movq %r10, %r15 // D1 <- D0
+	addq	%r11, %r15 // D1 <- D0 + 8*sdd*sizeof(float)
+	movq	%r15, %rax // D2 <- D1
+	addq	%r11, %rax // D2 <- D1 + 8*sdd*sizeof(float)
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r15)
+ vmovaps %ymm5, 32(%r15)
+ vmovaps %ymm6, 64(%r15)
+ vmovaps %ymm7, 96(%r15)
+
+ vmovaps %ymm8, 0(%rax)
+ vmovaps %ymm9, 32(%rax)
+ vmovaps %ymm10, 64(%rax)
+ vmovaps %ymm11, 96(%rax)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_24x4_lib8, .-inner_store_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
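+// Note: the _vs variant below writes the first two row panels in full and masks the
+// third one so that only rows below km are touched, while kn limits the number of
+// columns stored. The row mask is built by broadcasting (float)km and subtracting it
+// from the constants in .LC02 (defined elsewhere in this file): lanes whose constant
+// is smaller than km become negative, and vmaskmovps uses that sign bit as the
+// per-lane write enable. Illustratively:
+//
+//   for(i=0; i<8; i++) mask[i] = (LC02[i] - (float)km < 0.0f) ? ~0u : 0u;
+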
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_24X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_24x4_vs_lib8, @function
+inner_store_24x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_24x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_24x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC02(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmaskmovps %ymm8, %ymm15, 0(%r10, %r11, 2)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmaskmovps %ymm9, %ymm15, 32(%r10, %r11, 2)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmaskmovps %ymm10, %ymm15, 64(%r10, %r11, 2)
+ je 0f // end
+ vmovaps %ymm3, 96(%r10)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+ vmaskmovps %ymm11, %ymm15, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_24x4_vs_lib8, .-inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
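+// Note: the _gen variant below additionally clips the block against a row window
+// [m0,m1) and a column window [n0,n1): two row masks are built from (float)m0 and
+// (float)m1 against the .LC00/.LC02 constants, and for n0 > 0 the column registers
+// are shifted down while D is advanced by one column (32 bytes) per skipped column,
+// illustratively for n0 == 1:
+//
+//   ymm0 = ymm1; ymm4 = ymm5; ymm8 = ymm9;
+//   ymm1 = ymm2; ymm5 = ymm6; ymm9 = ymm10;
+//   ymm2 = ymm3; ymm6 = ymm7; ymm10 = ymm11;
+//   D += 32;
+//
+// Only the offset==0 store path is implemented; the offset>0 branches are TODO stubs.
+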
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_24X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_24x4_gen_lib8, @function
+inner_store_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_24x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC02(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm10, %ymm9
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ vmovaps %ymm11, %ymm10
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm10, %ymm9
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ addq $32, %r11
+
+0:
+
+ // compute D1
+ movq %r11, %rbx // D1
+	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
+	movq	%rbx, %rbp // D2
+	addq	%r12, %rbp // D2 <- D1 + 8*sdd*sizeof(float)
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ cmpl $2, %r15d
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmovaps %ymm4, 0(%rbx)
+ vmaskmovps %ymm8, %ymm15, 0(%rbp)
+ jl 7f // end
+ cmpl $3, %r15d
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmovaps %ymm5, 32(%rbx)
+ vmaskmovps %ymm9, %ymm15, 32(%rbp)
+ jl 7f // end
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmovaps %ymm6, 64(%rbx)
+ vmaskmovps %ymm10, %ymm15, 64(%rbp)
+ je 7f // end
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmovaps %ymm7, 96(%rbx)
+ vmaskmovps %ymm11, %ymm15, 96(%rbp)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+// movq %r11, %rbp // D1
+//	addq	%r12, %rbp // D2 <- D1 + 8*sdd*sizeof(float)
+	addq	%rbp, %r12 // D3 <- D2 + 8*sdd*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_24x4_gen_lib8, .-inner_store_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_20X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_20x4_lib8, @function
+inner_store_l_20x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_20x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_20x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_20x4_lib8:
+#endif
+#endif
+
+ vmovaps 0(%r10), %ymm12
+ vmovaps 32(%r10), %ymm13
+ vmovaps 64(%r10), %ymm14
+ vmovaps 96(%r10), %ymm15
+
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vblendps $0x1f, %ymm13, %ymm1, %ymm1
+ vblendps $0x3f, %ymm14, %ymm2, %ymm2
+ vblendps $0x7f, %ymm15, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+
+ vmovaps %ymm8, 0(%r10, %r11, 2)
+ vmovaps %ymm9, 32(%r10, %r11, 2)
+ vmovaps %ymm10, 64(%r10, %r11, 2)
+ vmovaps %ymm11, 96(%r10, %r11, 2)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_20x4_lib8, .-inner_store_l_20x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
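+// Note: the _l_ stores below write the block as the lower-triangular part of the
+// result: before storing columns 1..3 of the top panel they reload those columns from
+// D and blend the first 1, 2 or 3 lanes back in (immediates 0x01, 0x03, 0x07), so the
+// strictly upper-triangular entries already in D are preserved. Illustratively, for
+// column j of the first 8x4 panel: keep D rows 0..j-1, overwrite rows j..7.
+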
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_24X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_24x4_lib8, @function
+inner_store_l_24x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_24x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_24x4_lib8:
+#endif
+#endif
+
+ vmovaps 32(%r10), %ymm12
+ vmovaps 64(%r10), %ymm13
+ vmovaps 96(%r10), %ymm14
+
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vblendps $0x03, %ymm13, %ymm2, %ymm2
+ vblendps $0x07, %ymm14, %ymm3, %ymm3
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+
+ vmovaps %ymm8, 0(%r10, %r11, 2)
+ vmovaps %ymm9, 32(%r10, %r11, 2)
+ vmovaps %ymm10, 64(%r10, %r11, 2)
+ vmovaps %ymm11, 96(%r10, %r11, 2)
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_24x4_lib8, .-inner_store_l_24x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_20X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_20x4_vs_lib8, @function
+inner_store_l_20x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_20x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_20x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC02(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps 0(%r10), %ymm12
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmaskmovps %ymm8, %ymm15, 0(%r10, %r11, 2)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x1f, %ymm12, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmaskmovps %ymm9, %ymm15, 32(%r10, %r11, 2)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x3f, %ymm12, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmaskmovps %ymm10, %ymm15, 64(%r10, %r11, 2)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x7f, %ymm12, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+ vmaskmovps %ymm11, %ymm15, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_20x4_vs_lib8, .-inner_store_l_20x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- 8*sdd*sizeof(float)
+// r12 <- km
+// r13 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_24X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_24x4_vs_lib8, @function
+inner_store_l_24x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_24x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_24x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r12d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC02(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm13, %ymm15
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm4, 0(%r10, %r11, 1)
+ vmaskmovps %ymm8, %ymm15, 0(%r10, %r11, 2)
+ cmpl $2, %r13d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm5, 32(%r10, %r11, 1)
+ vmaskmovps %ymm9, %ymm15, 32(%r10, %r11, 2)
+ cmpl $3, %r13d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm6, 64(%r10, %r11, 1)
+ vmaskmovps %ymm10, %ymm15, 64(%r10, %r11, 2)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmovaps %ymm7, 96(%r10, %r11, 1)
+ vmaskmovps %ymm11, %ymm15, 96(%r10, %r11, 2)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_24x4_vs_lib8, .-inner_store_l_24x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_20X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_20x4_gen_lib8, @function
+inner_store_l_20x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_20x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_20x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_20x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC02(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm10, %ymm9
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ vmovaps %ymm11, %ymm10
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm10, %ymm9
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmovaps 0(%r11), %ymm12
+ vblendps $0x0f, %ymm12, %ymm0, %ymm0
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmovaps %ymm4, 0(%r11, %r12, 1)
+ vmaskmovps %ymm8, %ymm15, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x1f, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmovaps %ymm5, 32(%r11, %r12, 1)
+ vmaskmovps %ymm9, %ymm15, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x3f, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmovaps %ymm6, 64(%r11, %r12, 1)
+ vmaskmovps %ymm10, %ymm15, 64(%r11, %r12, 2)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x7f, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmovaps %ymm7, 96(%r11, %r12, 1)
+ vmaskmovps %ymm11, %ymm15, 96(%r11, %r12, 2)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_20x4_gen_lib8, .-inner_store_l_20x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 8*sdd*sizeof(float)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_24X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_24x4_gen_lib8, @function
+inner_store_l_24x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_24x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_24x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+ vmovups .LC02(%rip), %ymm13
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+ vmovups LC02(%rip), %ymm13
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm13, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm10, %ymm9
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm7, %ymm6
+ vmovaps %ymm11, %ymm10
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm10, %ymm9
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm9, %ymm8
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm14, 0(%r11)
+ vmovaps %ymm4, 0(%r11, %r12, 1)
+ vmaskmovps %ymm8, %ymm15, 0(%r11, %r12, 2)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm14, 32(%r11)
+ vmovaps %ymm5, 32(%r11, %r12, 1)
+ vmaskmovps %ymm9, %ymm15, 32(%r11, %r12, 2)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm14, 64(%r11)
+ vmovaps %ymm6, 64(%r11, %r12, 1)
+ vmaskmovps %ymm10, %ymm15, 64(%r11, %r12, 2)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm14, 96(%r11)
+ vmovaps %ymm7, 96(%r11, %r12, 1)
+ vmaskmovps %ymm11, %ymm15, 96(%r11, %r12, 2)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_24x4_gen_lib8, .-inner_store_l_24x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemm_nt_24x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
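+// Note: as an illustrative (unoptimized) reference for the kernel below, ignoring the
+// lib8 panel-major layout of A, B, C and D, the computed operation is
+// D = alpha*A*B^T + beta*C on a 24x4 block:
+//
+//   for(i=0; i<24; i++)
+//     for(j=0; j<4; j++)
+//       {
+//       float t = 0.0f;
+//       for(l=0; l<k; l++) t += A[i][l]*B[j][l];
+//       D[i][j] = alpha[0]*t + beta[0]*C[i][j];
+//       }
+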
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_24x4_lib8
+ .type kernel_sgemm_nt_24x4_lib8, @function
+kernel_sgemm_nt_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_24x4_lib8
+_kernel_sgemm_nt_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_24x4_lib8
+ .def kernel_sgemm_nt_24x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_24x4_lib8, .-kernel_sgemm_nt_24x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_nt_24x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_24x4_vs_lib8
+ .type kernel_sgemm_nt_24x4_vs_lib8, @function
+kernel_sgemm_nt_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_24x4_vs_lib8
+_kernel_sgemm_nt_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_24x4_vs_lib8
+ .def kernel_sgemm_nt_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_24x4_vs_lib8, .-kernel_sgemm_nt_24x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80
+// void kernel_sgemm_nt_24x4_gen_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_24x4_gen_lib8
+ .type kernel_sgemm_nt_24x4_gen_lib8, @function
+kernel_sgemm_nt_24x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_24x4_gen_lib8
+_kernel_sgemm_nt_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_24x4_gen_lib8
+ .def kernel_sgemm_nt_24x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_24x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // offsetC
+ movq ARG8, %r13 // C
+ movq ARG9, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_24x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG10, %r10 // offsetD
+ movq ARG11, %r11 // D
+ movq ARG12, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG13, %r13 // m0
+ movq ARG14, %r14 // m1
+ movq ARG15, %r15 // n0
+ movq ARG16, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_24x4_gen_lib8, .-kernel_sgemm_nt_24x4_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_nn_24x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd);
+
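+// Note: the nn kernels below compute D = alpha*A*B + beta*C (B accessed by columns,
+// illustratively D[i][j] = alpha*sum_l A[i][l]*B[l][j] + beta*C[i][j]), so compared to
+// the nt kernels they take an extra offsetB/sdb pair and first call an edge routine to
+// consume the initial, possibly partial, panel of B before the main inner loop.
+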
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_24x4_lib8
+ .type kernel_sgemm_nn_24x4_lib8, @function
+kernel_sgemm_nn_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_24x4_lib8
+_kernel_sgemm_nn_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_24x4_lib8
+ .def kernel_sgemm_nn_24x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_24x4_lib8, .-kernel_sgemm_nn_24x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_sgemm_nn_24x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_24x4_vs_lib8
+ .type kernel_sgemm_nn_24x4_vs_lib8, @function
+kernel_sgemm_nn_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_24x4_vs_lib8
+_kernel_sgemm_nn_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_24x4_vs_lib8
+ .def kernel_sgemm_nn_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+	// call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // C
+ movq ARG10, %r13 // sdc
+	sall	$5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG13, %r12 // km
+ movq ARG14, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_24x4_vs_lib8, .-kernel_sgemm_nn_24x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88 rsp+96
+// void kernel_sgemm_nn_24x4_gen_lib8(int k, float *alpha, float *A, int sda, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_24x4_gen_lib8
+ .type kernel_sgemm_nn_24x4_gen_lib8, @function
+kernel_sgemm_nn_24x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_24x4_gen_lib8
+_kernel_sgemm_nn_24x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_24x4_gen_lib8
+ .def kernel_sgemm_nn_24x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_24x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG8, %r11 // beta
+ movq ARG9, %r12 // offsetC
+ movq ARG10, %r13 // C
+ movq ARG11, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_24x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG12, %r10 // offsetD
+ movq ARG13, %r11 // D
+ movq ARG14, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG15, %r13 // m0
+ movq ARG16, %r14 // m1
+ movq ARG17, %r15 // n0
+ movq ARG18, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_24x4_gen_lib8, .-kernel_sgemm_nn_24x4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_strsm_nt_rl_inv_24x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
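+//
+// informal note (added, approximate description): this kernel appears to fuse a 24x4
+// update D = C - A*B^T with a triangular solve against the transposed 4x4 lower
+// triangular factor E applied from the right, using the precomputed reciprocal
+// diagonal entries in inv_diag_E instead of divisions.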
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_24x4_lib8
+ .type kernel_strsm_nt_rl_inv_24x4_lib8, @function
+kernel_strsm_nt_rl_inv_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_24x4_lib8
+_kernel_strsm_nt_rl_inv_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_24x4_lib8
+ .def kernel_strsm_nt_rl_inv_24x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movl $4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_24x4_lib8, .-kernel_strsm_nt_rl_inv_24x4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_strsm_nt_rl_inv_24x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_24x4_vs_lib8
+ .type kernel_strsm_nt_rl_inv_24x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_24x4_vs_lib8
+_kernel_strsm_nt_rl_inv_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_24x4_vs_lib8
+ .def kernel_strsm_nt_rl_inv_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+	movq	ARG1, %r10 // k
+	movq	ARG2, %r11 // A
+	movq	ARG3, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // m1
+ movq ARG12, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_24x4_vs_lib8, .-kernel_strsm_nt_rl_inv_24x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14
+// void kernel_sgemm_strsm_nt_rl_inv_24x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E);
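+//
+// informal note (added, approximate description): fused gemm + trsm variant of the
+// kernel above: it accumulates + Ap*Bp^T (kp columns) and - Am*Bm^T (km columns) onto C
+// before the triangular solve, avoiding a separate store of the intermediate result.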
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_24x4_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_24x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_24x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_24x4_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_24x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movl $4, %r12d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_24x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_24x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
+// void kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG13, %r10 // E
+ movq ARG14, %r11 // inv_diag_E
+ movq ARG16, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG15, %r12 // km
+ movq ARG16, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_24x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_spotrf_nt_l_20x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
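+//
+// informal note (added, approximate description): Cholesky kernel; it appears to form
+// C - A*B^T and then factorize the 4x4 diagonal block of the result, scaling the rows
+// below it by the reciprocal pivots, which are also written out through inv_diag_D.
+// Only the lower-triangular part ("_l") of the diagonal block is stored.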
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_20x4_lib8
+ .type kernel_spotrf_nt_l_20x4_lib8, @function
+kernel_spotrf_nt_l_20x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_20x4_lib8
+_kernel_spotrf_nt_l_20x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_20x4_lib8
+ .def kernel_spotrf_nt_l_20x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_20x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_20x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_20X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_20x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_20x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_20x4_lib8, .-kernel_spotrf_nt_l_20x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_spotrf_nt_l_20x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_20x4_vs_lib8
+ .type kernel_spotrf_nt_l_20x4_vs_lib8, @function
+kernel_spotrf_nt_l_20x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_20x4_vs_lib8
+_kernel_spotrf_nt_l_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_20x4_vs_lib8
+ .def kernel_spotrf_nt_l_20x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_20x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_20x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // m1
+ movq ARG11, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_20x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_20x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_20x4_vs_lib8, .-kernel_spotrf_nt_l_20x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_spotrf_nt_l_24x4_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_24x4_lib8
+ .type kernel_spotrf_nt_l_24x4_lib8, @function
+kernel_spotrf_nt_l_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_24x4_lib8
+_kernel_spotrf_nt_l_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_24x4_lib8
+ .def kernel_spotrf_nt_l_24x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_24x4_lib8, .-kernel_spotrf_nt_l_24x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_spotrf_nt_l_24x4_vs_lib8(int k, float *A, int sda, float *B, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_24x4_vs_lib8
+ .type kernel_spotrf_nt_l_24x4_vs_lib8, @function
+kernel_spotrf_nt_l_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_24x4_vs_lib8
+_kernel_spotrf_nt_l_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_24x4_vs_lib8
+ .def kernel_spotrf_nt_l_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG2, %r11 // A
+ movq ARG3, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG4, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG5, %r10 // C
+ movq ARG6, %r11 // sdc
+ sall $5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // m1
+ movq ARG11, %r13 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+	.size	kernel_spotrf_nt_l_24x4_vs_lib8, .-kernel_spotrf_nt_l_24x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_ssyrk_spotrf_nt_l_20x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
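+//
+// informal note (added, approximate description): fused syrk + Cholesky kernel,
+// analogous to the sgemm+strsm fusion above: + Ap*Bp^T and - Am*Bm^T are accumulated
+// onto C before the factorization step of the resulting block.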
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_20x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_20x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_20x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_20x4_lib8
+_kernel_ssyrk_spotrf_nt_l_20x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_20x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_20x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_20x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_20x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_20X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_20x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_20x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_20x4_lib8, .-kernel_ssyrk_spotrf_nt_l_20x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_20x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_20x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_20x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_20x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_20x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_ssyrk_spotrf_nt_l_24x4_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_24x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_24x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_24x4_lib8
+_kernel_ssyrk_spotrf_nt_l_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_24x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_24x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_24x4_lib8, .-kernel_ssyrk_spotrf_nt_l_24x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8(int kp, float *Ap, int sdap, float *Bp, int km, float *Am, int sdam, float *Bm, float *C, int sdc, float *D, int sdd, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+ vmovapd %ymm0, %ymm4
+ vmovapd %ymm0, %ymm5
+ vmovapd %ymm0, %ymm6
+ vmovapd %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // sdap
+	sall	$5, %r12d // 8*sdap*sizeof(float)
+ movq ARG4, %r13 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG5, %r10 // km
+ movq ARG6, %r11 // Am
+ movq ARG7, %r12 // sdam
+	sall	$5, %r12d // 8*sdam*sizeof(float)
+ movq ARG8, %r13 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner blender nn
+
+ movq ARG9, %r10 // C
+ movq ARG10, %r11 // sdc
+	sall	$5, %r11d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_24x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG13, %r10 // inv_diag_D
+ movq ARG15, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_24x4_vs_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG11, %r10 // store address D
+ movq ARG12, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+ movq ARG14, %r12 // km
+ movq ARG15, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_24x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_ssyrk_nt_l_24x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
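+//
+// informal note (added, approximate description): plain syrk kernel,
+// D = beta*C + alpha*A*B^T for a 24x4 block, with the "_l" store routine writing only
+// the lower-triangular part of the 4x4 diagonal block.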
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_24x4_lib8
+ .type kernel_ssyrk_nt_l_24x4_lib8, @function
+kernel_ssyrk_nt_l_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_24x4_lib8
+_kernel_ssyrk_nt_l_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_24x4_lib8
+ .def kernel_ssyrk_nt_l_24x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_24x4_lib8, .-kernel_ssyrk_nt_l_24x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_ssyrk_nt_l_24x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_24x4_vs_lib8
+ .type kernel_ssyrk_nt_l_24x4_vs_lib8, @function
+kernel_ssyrk_nt_l_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_24x4_vs_lib8
+_kernel_ssyrk_nt_l_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_24x4_vs_lib8
+ .def kernel_ssyrk_nt_l_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_24x4_vs_lib8, .-kernel_ssyrk_nt_l_24x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_ssyrk_nt_l_20x4_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_20x4_lib8
+ .type kernel_ssyrk_nt_l_20x4_lib8, @function
+kernel_ssyrk_nt_l_20x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_20x4_lib8
+_kernel_ssyrk_nt_l_20x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_20x4_lib8
+ .def kernel_ssyrk_nt_l_20x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_20x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_20X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_20x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_20x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_20x4_lib8, .-kernel_ssyrk_nt_l_20x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_ssyrk_nt_l_20x4_vs_lib8(int k, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_20x4_vs_lib8
+ .type kernel_ssyrk_nt_l_20x4_vs_lib8, @function
+kernel_ssyrk_nt_l_20x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_20x4_vs_lib8
+_kernel_ssyrk_nt_l_20x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_20x4_vs_lib8
+ .def kernel_ssyrk_nt_l_20x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_20x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // sda
+ sall $5, %r12d // 8*sda*sizeof(float)
+ movq ARG5, %r13 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+	movq	ARG2, %r10 // alpha
+ movq ARG6, %r11 // beta
+ movq ARG7, %r12 // C
+ movl ARG8, %r13d // sdc
+ sall $5, %r13d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movl ARG10, %r11d // sdd
+ sall $5, %r11d // 8*sdd*sizeof(float)
+ movq ARG11, %r12 // km
+ movq ARG12, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_20X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_20x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_20x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_20x4_vs_lib8, .-kernel_ssyrk_nt_l_20x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strmm_nn_rl_24x4_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd);
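+//
+// informal note (added, approximate description): triangular matrix-matrix multiply,
+// D = alpha * A * B with B lower triangular ("nn_rl"); an edge routine below handles
+// the initial triangle of B before the regular gemm loop takes over.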
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_24x4_lib8
+ .type kernel_strmm_nn_rl_24x4_lib8, @function
+kernel_strmm_nn_rl_24x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_24x4_lib8
+_kernel_strmm_nn_rl_24x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_24x4_lib8
+ .def kernel_strmm_nn_rl_24x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_24x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_24x4_lib8, .-kernel_strmm_nn_rl_24x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_strmm_nn_rl_24x4_vs_lib8(int k, float *alpha, float *A, int sda, int offsetB, float *B, int sdb, float *D, int sdd, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_24x4_vs_lib8
+ .type kernel_strmm_nn_rl_24x4_vs_lib8, @function
+kernel_strmm_nn_rl_24x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_24x4_vs_lib8
+_kernel_strmm_nn_rl_24x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_24x4_vs_lib8
+ .def kernel_strmm_nn_rl_24x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_24x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+ vmovaps %ymm0, %ymm8
+ vmovaps %ymm0, %ymm9
+ vmovaps %ymm0, %ymm10
+ vmovaps %ymm0, %ymm11
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+	movq	ARG4, %r12 // sda
+	sall	$5, %r12d // 8*sda*sizeof(float)
+	movq	ARG6, %r13 // B
+	movq	ARG7, %r14 // sdb
+	sall	$5, %r14d // 8*sdb*sizeof(float)
+ movq ARG5, %r15 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_24x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+ // call inner dgemm kernel nt after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_24x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_24X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_24x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_24x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG8, %r10 // D
+ movq ARG9, %r11 // sdd
+	sall	$5, %r11d // 8*sdd*sizeof(float)
+ movq ARG10, %r12 // km
+ movq ARG11, %r13 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_24X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_24x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_24x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_24x4_vs_lib8, .-kernel_strmm_nn_rl_24x4_vs_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#endif
+
+#if defined(OS_LINUX)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC04: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
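+// informal note (added, not part of the original sources): the .long values above are
+// IEEE-754 single-precision bit patterns, e.g. 1056964608 = 0x3f000000 = 0.5f,
+// 1069547520 = 0x3fc00000 = 1.5f, 1065353216 = 0x3f800000 = 1.0f and
+// 3212836864 = 0xbf800000 = -1.0f; each comment lists the resulting 8-float vector
+// from the highest lane down to lane 0.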
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_sgemm_8x4_lib8.S b/kernel/avx2/kernel_sgemm_8x4_lib8.S
new file mode 100644
index 0000000..44946f1
--- /dev/null
+++ b/kernel/avx2/kernel_sgemm_8x4_lib8.S
@@ -0,0 +1,7342 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
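+// informal note (added, not part of the original sources): with the System V layout
+// above, PROLOGUE moves %rsp down by STACKSIZE (64) bytes to save the callee-saved
+// registers, so the 7th and later integer arguments pushed by the caller are reached
+// at STACKSIZE + 8(%rsp), STACKSIZE + 16(%rsp), ... (the extra 8 bytes skip the return
+// address). The Windows variant additionally saves %rdi, %rsi and xmm6-xmm15, which
+// are callee-saved in that ABI, hence the larger STACKSIZE of 256.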
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
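+// informal note (added, not part of the original sources): a rough scalar sketch of
+// what this routine accumulates, assuming panel-major storage with 8-float columns
+// (32-byte stride per k index) for both A and B:
+//
+//   for(kk=0; kk<k; kk++)
+//     for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++)
+//         d[ii][jj] += A[ii+8*kk] * B[jj+8*kk];
+//
+// one 8-float accumulator per output column jj lives in ymm0-ymm3 (with a second set
+// in ymm4-ymm7 for the unrolled iterations), and each B entry is broadcast into ymm12.
+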
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_8x4_lib8, @function
+inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x4_lib8:
+#endif
+#endif
+
+// broadcast scheme
+#if 1
+
+ cmpl $0, %r10d
+ jle 5f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+
+ vxorps %ymm4, %ymm4, %ymm4
+ vmovaps %ymm4, %ymm5
+ vmovaps %ymm4, %ymm6
+ vmovaps %ymm4, %ymm7
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastss 0(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm14 // A
+ vbroadcastss 4(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 8(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 12(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastss 32(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovaps 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss 40(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 44(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastss 64(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps -32(%r11), %ymm14 // A
+ vbroadcastss 68(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 72(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 76(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastss -32(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovaps 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss -24(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss -20(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm14 // a
+ vbroadcastss 4(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 8(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 12(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastss 32(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+ vmovaps 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss 40(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 44(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastss 64(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps -32(%r11), %ymm14 // A
+ vbroadcastss 68(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 72(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 76(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastss -32(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm4
+// vmovaps 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss -24(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss -20(%r12), %ymm12 // B
+ vfmadd231ps %ymm14, %ymm12, %ymm7
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm13 // a
+ vbroadcastss 0(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 4(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ subl $1, %r10d
+ vbroadcastss 8(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ addq $32, %r11
+ vbroadcastss 12(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // reduce
+
+ vaddps %ymm4, %ymm0, %ymm0
+ vaddps %ymm5, %ymm1, %ymm1
+ vaddps %ymm6, %ymm2, %ymm2
+ vaddps %ymm7, %ymm3, %ymm3
+
+5: // return
+
+// shuffle scheme
+#else
+
+ cmpl $0, %r10d
+ jle 5f // return
+
+ // preload
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ vxorps %ymm4, %ymm4, %ymm4
+ vmovaps %ymm4, %ymm5
+ vmovaps %ymm4, %ymm6
+ vmovaps %ymm4, %ymm7
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+ vbroadcastf128 128(%r12), %ymm14 // B
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vbroadcastf128 32(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+ vbroadcastf128 64(%r12), %ymm14 // B
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vbroadcastf128 96(%r12), %ymm15 // B
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+// vbroadcastf128 128(%r12), %ymm14 // B
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+// vbroadcastf128 32(%r12), %ymm15 // B
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm0, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm1, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm2, %ymm2
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vmulps %ymm12, %ymm14, %ymm11
+ vaddps %ymm11, %ymm3, %ymm3
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // reduce
+
+ vaddps %ymm4, %ymm0, %ymm0
+ vaddps %ymm5, %ymm1, %ymm1
+ vaddps %ymm6, %ymm2, %ymm2
+ vaddps %ymm7, %ymm3, %ymm3
+
+5: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_8x4_lib8, .-inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
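+// Reference sketch in C (not from this file; it assumes the lib8 panel-major
+// layout, i.e. A, B and the accumulator are stored in 8-row column-interleaved
+// panels). The broadcast scheme above computes, with ymm0..ymm3 (plus ymm4..ymm7
+// in the unrolled part) holding the four accumulator columns:
+//
+//   for(kk=0; kk<k; kk++)
+//       for(jj=0; jj<4; jj++)
+//           for(ii=0; ii<8; ii++)
+//               D[ii+8*jj] += A[ii+8*kk] * B[jj+8*kk];
+//
+// The shuffle scheme in the disabled #else branch computes the same products,
+// but keeps four B values in a register and rotates them with vshufps between
+// FMAs instead of re-broadcasting from memory.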
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30 d40 d50 d60 d70]
+// ymm1 <- [d01 d11 d21 d31 d41 d51 d61 d71]
+// ymm2 <- [d02 d12 d22 d32 d42 d52 d62 d72]
+// ymm3 <- [d03 d13 d23 d33 d43 d53 d63 d73]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_8x4_lib8, @function
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 5f // return
+
+ // preload
+ vmovaps 0(%r11), %ymm13 // A
+
+ vxorps %ymm4, %ymm4, %ymm4
+ vmovaps %ymm4, %ymm5
+ vmovaps %ymm4, %ymm6
+ vmovaps %ymm4, %ymm7
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vbroadcastss 0(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm14 // A
+ vbroadcastss 4(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 8(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 12(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastss 32(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vmovaps 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss 40(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 44(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastss 64(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps -32(%r11), %ymm14 // A
+ vbroadcastss 68(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 72(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 76(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastss -32(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vmovaps 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss -24(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss -20(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vbroadcastss 0(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm14 // a
+ vbroadcastss 4(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 8(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 12(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ subl $4, %r10d
+
+ // unroll 0
+ vbroadcastss 32(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+ vmovaps 64(%r11), %ymm13 // A
+ vbroadcastss 36(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss 40(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss 44(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+ addq $128, %r11
+
+ // unroll 0
+ vbroadcastss 64(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vmovaps -32(%r11), %ymm14 // A
+ vbroadcastss 68(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 72(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 76(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ addq $128, %r12
+
+ // unroll 0
+ vbroadcastss -32(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm4
+// vmovaps 0(%r11), %ymm13 // A
+ vbroadcastss -28(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm5
+ vbroadcastss -24(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm6
+ vbroadcastss -20(%r12), %ymm12 // B
+ vfnmadd231ps %ymm14, %ymm12, %ymm7
+
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm13 // a
+ vbroadcastss 0(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 4(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ subl $1, %r10d
+ vbroadcastss 8(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ addq $32, %r11
+ vbroadcastss 12(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // reduce
+
+ vaddps %ymm4, %ymm0, %ymm0
+ vaddps %ymm5, %ymm1, %ymm1
+ vaddps %ymm6, %ymm2, %ymm2
+ vaddps %ymm7, %ymm3, %ymm3
+
+5: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_8x4_lib8, .-inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
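+// The sub_nt routine above is the add_nt routine with vfnmadd231ps in place of
+// vfmadd231ps, i.e. in the C sketch used earlier the update becomes
+//
+//   D[ii+8*jj] -= A[ii+8*kk] * B[jj+8*kk];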
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_8x4_lib8, @function
+inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+ vxorps %ymm4, %ymm4, %ymm4
+ vmovaps %ymm4, %ymm5
+ vmovaps %ymm4, %ymm6
+ vmovaps %ymm4, %ymm7
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+// prefetcht0 0(%r12, %r13, 1) // software prefetch
+// prefetcht0 64(%r12, %r13, 1) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+ subl $8, %r10d
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ addq $256, %r11
+
+ // unroll 7
+ vmovaps -32(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+	addq	%r13, %r12 // B += 8*sdb*sizeof(float), i.e. next panel of B
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+ vaddps %ymm4, %ymm0, %ymm0
+ vaddps %ymm5, %ymm1, %ymm1
+ vaddps %ymm6, %ymm2, %ymm2
+ vaddps %ymm7, %ymm3, %ymm3
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_8x4_lib8, .-inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
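+// Reference sketch in C (not from this file): in the NN variant B is read along
+// the rows of its 8-row panels, so the element for column jj at step kk sits at
+// index (kk%8) + 8*jj of the current panel, and B jumps ahead by one panel
+// (8*sdb floats) every 8 steps:
+//
+//   for(kk=0; kk<k; kk++)
+//       for(jj=0; jj<4; jj++)
+//           for(ii=0; ii<8; ii++)
+//               D[ii+8*jj] += A[ii+8*kk] * B[(kk/8)*8*sdb + (kk%8) + 8*jj];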
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4)
+// r13 <- 4*sdb*sizeof(double)
+// r14 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nn_8x4_lib8, @function
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nn_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+ vxorps %ymm4, %ymm4, %ymm4
+ vmovaps %ymm4, %ymm5
+ vmovaps %ymm4, %ymm6
+ vmovaps %ymm4, %ymm7
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 1) // software prefetch
+ prefetcht0 64(%r12, %r13, 1) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm3
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm3
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm3
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm7
+ subl $8, %r10d
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm3
+ addq $256, %r11
+
+ // unroll 7
+ vmovaps -32(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm7
+	addq	%r13, %r12 // B += 8*sdb*sizeof(float), i.e. next panel of B
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+ vaddps %ymm4, %ymm0, %ymm0
+ vaddps %ymm5, %ymm1, %ymm1
+ vaddps %ymm6, %ymm2, %ymm2
+ vaddps %ymm7, %ymm3, %ymm3
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vfnmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vfnmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vfnmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vfnmadd231ps %ymm12, %ymm13, %ymm3
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nn_8x4_lib8, .-inner_kernel_gemm_sub_nn_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_8x4_lib8, @function
+inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %r15d
+ subl %r14d, %r15d // 8-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,8-offsetB)
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r12 // B+offsetB*sizeof(float)
+
+1:
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+ vbroadcastss 96(%r12), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm3, %ymm3
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r12 // B+1*sizeof(float)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_8x4_lib8, .-inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
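+// Reference sketch in C (not from this file): with B logically starting at row
+// offB of its 8-row panel, the edge routine above peels off kend = min(k, 8-offB)
+// scalar k-steps so that the main NN kernel afterwards sees B aligned to a
+// panel boundary:
+//
+//   kend = k < 8-offB ? k : 8-offB;
+//   for(kk=0; kk<kend; kk++)
+//       for(jj=0; jj<4; jj++)
+//           for(ii=0; ii<8; ii++)
+//               D[ii+8*jj] += A[ii+8*kk] * B[offB+kk + 8*jj];
+//   // afterwards: k -= kend, A += 8*kend, and (if any k-steps remain)
+//   // B is advanced to the start of the next panel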
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trmm_nn_rl_8x4_lib8, @function
+inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trmm_nn_rl_8x4_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ movq %r12, %rbx // B
+ addq %rax, %rbx // B+offsetB*sizeof(float)
+
+
+ cmpl $4, %r14d
+ jg 1f
+
+ // offB==0, 1, 2, 3, 4
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+ cmpl $5, %r14d
+ jg 1f
+
+ // offB==5
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 8(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 40(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 72(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movl $0, %r14d // offsetB=0
+
+ jmp 0f // end
+
+
+1:
+ cmpl $6, %r14d
+ jg 1f
+
+ // offB==6
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movq %r12, %rbx // B
+ movl $0, %r14d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 64(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ jmp 0f // end
+
+
+1:
+// cmpl $7, %r14d
+// jg 0f
+
+	// offB==7
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq %r13, %r12 // B+8*sdb*sizeof(float)
+ movq %r12, %rbx // B
+ movl $0, %r14d // offsetB=0
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 0(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 32(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+ cmpl $0, %r10d
+ jle 0f // end
+
+ vmovaps 0(%r11), %ymm8
+ vbroadcastss 4(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm0, %ymm0
+ vbroadcastss 36(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm1, %ymm1
+ vbroadcastss 68(%rbx), %ymm12
+ vmulps %ymm8, %ymm12, %ymm15
+ vaddps %ymm15, %ymm2, %ymm2
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addl $1, %r14d // offsetB+1
+
+// jmp 0f // end
+
+
+ // end
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trmm_nn_rl_8x4_lib8, .-inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+
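+// Reference sketch in C (not from this file): for the lower triangular B of
+// strmm (right, lower, not transposed), logical row kk of the leading triangle
+// only touches columns 0..kk, so the first three k-steps are peeled off here
+// before falling through to the general NN kernel:
+//
+//   for(kk=0; kk<3 && kk<k; kk++)
+//       for(jj=0; jj<=kk; jj++)
+//           for(ii=0; ii<8; ii++)
+//               D[ii+8*jj] += A[ii+8*kk] * B[offB+kk + 8*jj];
+//
+// The offB==5,6,7 branches do the same work but cross into the next 8-row panel
+// of B partway through, which is why they reload the panel pointer (rbx) and
+// reset offsetB.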
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x4_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x4_lib8, .-inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
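+// Reference sketch in C (not from this file): the routine above solves
+// X * E^T = D in place for a lower triangular E (strsm, right, lower,
+// transposed, not unit), using the pre-inverted diagonal in inv_diag_E:
+//
+//   for(jj=0; jj<4; jj++) {
+//       for(ii=0; ii<8; ii++)
+//           D[ii+8*jj] *= inv_diag_E[jj];           // divide by E[jj][jj]
+//       for(ll=jj+1; ll<4; ll++)
+//           for(ii=0; ii<8; ii++)
+//               D[ii+8*ll] -= D[ii+8*jj] * E[ll+8*jj];
+//   }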
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x4_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x4_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ cmpl $2, %r12d
+ jl 0f // ret
+ vbroadcastss 4(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ cmpl $3, %r12d
+ jl 0f // ret
+ vbroadcastss 40(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ cmpl $4, %r12d
+ jl 0f // ret
+ vbroadcastss 76(%r10), %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x4_vs_lib8, .-inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+
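+// The _vs ("variable size") variant above performs the same substitution as the
+// fixed-size routine; the only difference is that after scaling column jj it
+// returns early when kn <= jj+1, skipping updates to columns that the caller
+// will not store.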
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_4X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_4x8_vs_lib8, @function
+inner_edge_trsm_rlt_inv_4x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_4x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_4x8_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %xmm13
+ vmulps %xmm0, %xmm13, %xmm0
+ vbroadcastss 4(%r10), %xmm13
+ vfnmadd231ps %xmm0, %xmm13, %xmm1
+ vbroadcastss 8(%r10), %xmm13
+ vfnmadd231ps %xmm0, %xmm13, %xmm2
+ vbroadcastss 12(%r10), %xmm13
+ vfnmadd231ps %xmm0, %xmm13, %xmm3
+ vbroadcastss 16(%r10), %xmm13
+ vfnmadd231ps %xmm0, %xmm13, %xmm4
+ vbroadcastss 20(%r10), %xmm13
+ vfnmadd231ps %xmm0, %xmm13, %xmm5
+ vbroadcastss 24(%r10), %xmm13
+ vfnmadd231ps %xmm0, %xmm13, %xmm6
+ vbroadcastss 28(%r10), %xmm13
+ vfnmadd231ps %xmm0, %xmm13, %xmm7
+
+ vbroadcastss 4(%r11), %xmm13
+ vmulps %xmm1, %xmm13, %xmm1
+ vbroadcastss 40(%r10), %xmm13
+ vfnmadd231ps %xmm1, %xmm13, %xmm2
+ vbroadcastss 44(%r10), %xmm13
+ vfnmadd231ps %xmm1, %xmm13, %xmm3
+ vbroadcastss 48(%r10), %xmm13
+ vfnmadd231ps %xmm1, %xmm13, %xmm4
+ vbroadcastss 52(%r10), %xmm13
+ vfnmadd231ps %xmm1, %xmm13, %xmm5
+ vbroadcastss 56(%r10), %xmm13
+ vfnmadd231ps %xmm1, %xmm13, %xmm6
+ vbroadcastss 60(%r10), %xmm13
+ vfnmadd231ps %xmm1, %xmm13, %xmm7
+
+ vbroadcastss 8(%r11), %xmm13
+ vmulps %xmm2, %xmm13, %xmm2
+ vbroadcastss 76(%r10), %xmm13
+ vfnmadd231ps %xmm2, %xmm13, %xmm3
+ vbroadcastss 80(%r10), %xmm13
+ vfnmadd231ps %xmm2, %xmm13, %xmm4
+ vbroadcastss 84(%r10), %xmm13
+ vfnmadd231ps %xmm2, %xmm13, %xmm5
+ vbroadcastss 88(%r10), %xmm13
+ vfnmadd231ps %xmm2, %xmm13, %xmm6
+ vbroadcastss 92(%r10), %xmm13
+ vfnmadd231ps %xmm2, %xmm13, %xmm7
+
+ vbroadcastss 12(%r11), %xmm13
+ vmulps %xmm3, %xmm13, %xmm3
+ vbroadcastss 112(%r10), %xmm13
+ vfnmadd231ps %xmm3, %xmm13, %xmm4
+ vbroadcastss 116(%r10), %xmm13
+ vfnmadd231ps %xmm3, %xmm13, %xmm5
+ vbroadcastss 120(%r10), %xmm13
+ vfnmadd231ps %xmm3, %xmm13, %xmm6
+ vbroadcastss 124(%r10), %xmm13
+ vfnmadd231ps %xmm3, %xmm13, %xmm7
+
+ vbroadcastss 16(%r11), %xmm13
+ vmulps %xmm4, %xmm13, %xmm4
+ cmpl $6, %r12d
+ jl 0f // ret
+ vbroadcastss 148(%r10), %xmm13
+ vfnmadd231ps %xmm4, %xmm13, %xmm5
+ vbroadcastss 152(%r10), %xmm13
+ vfnmadd231ps %xmm4, %xmm13, %xmm6
+ vbroadcastss 156(%r10), %xmm13
+ vfnmadd231ps %xmm4, %xmm13, %xmm7
+
+ vbroadcastss 20(%r11), %xmm13
+ vmulps %xmm5, %xmm13, %xmm5
+ cmpl $7, %r12d
+ jl 0f // ret
+ vbroadcastss 184(%r10), %xmm13
+ vfnmadd231ps %xmm5, %xmm13, %xmm6
+ vbroadcastss 188(%r10), %xmm13
+ vfnmadd231ps %xmm5, %xmm13, %xmm7
+
+ vbroadcastss 24(%r11), %xmm13
+ vmulps %xmm6, %xmm13, %xmm6
+ cmpl $8, %r12d
+ jl 0f // ret
+ vbroadcastss 220(%r10), %xmm13
+ vfnmadd231ps %xmm6, %xmm13, %xmm7
+
+ vbroadcastss 28(%r11), %xmm13
+ vmulps %xmm7, %xmm13, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_4x8_vs_lib8, .-inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x4_lib8, @function
+inner_edge_potrf_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x4_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss	%xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x4_lib8, .-inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
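+// Reference sketch in C (not from this file): right-looking Cholesky step on
+// the 8x4 panel whose top 4x4 block is the diagonal block, also writing the
+// reciprocal square roots of the pivots to inv_diag_D; a non-positive pivot is
+// replaced by a zero factor instead of aborting:
+//
+//   for(jj=0; jj<4; jj++) {
+//       float djj  = D[jj+8*jj];
+//       float dinv = djj > 0.0f ? 1.0f/sqrtf(djj) : 0.0f;
+//       inv_diag_D[jj] = dinv;
+//       for(ii=0; ii<8; ii++)
+//           D[ii+8*jj] *= dinv;                        // scale column jj
+//       for(ll=jj+1; ll<4; ll++)
+//           for(ii=0; ii<8; ii++)
+//               D[ii+8*ll] -= D[ii+8*jj] * D[ll+8*jj]; // update trailing columns
+//   }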
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization gen
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x4_vs_lib8, @function
+inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x4_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ cmpl $2, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm1, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm0, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ cmpl $3, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm2, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm1, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ cmpl $4, %r11d
+ jl 0f // ret
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vmulps %ymm2, %ymm13, %ymm12
+ vsubps %ymm12, %ymm3, %ymm3
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+	vmovss	%xmm13, 12(%r10)
+ vpermilps $0x00, %xmm13, %xmm13
+ vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x4_vs_lib8, .-inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_lib8, @function
+inner_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_lib8, .-inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
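+// Reference sketch in C (not from this file): generic alpha/beta scaling of the
+// accumulator against C; the beta==0.0 test lets the caller pass an
+// uninitialized C, since C is then never loaded:
+//
+//   for(jj=0; jj<4; jj++)
+//       for(ii=0; ii<8; ii++)
+//           acc[ii+8*jj] = alpha*acc[ii+8*jj]
+//                        + (beta!=0.0f ? beta*C[ii+8*jj] : 0.0f);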
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_lib8, @function
+inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // transpose
+ vunpcklps %ymm1, %ymm0, %ymm5
+ vunpckhps %ymm1, %ymm0, %ymm4
+ vunpcklps %ymm3, %ymm2, %ymm7
+ vunpckhps %ymm3, %ymm2, %ymm6
+
+ vunpcklpd %ymm7, %ymm5, %ymm0
+ vunpckhpd %ymm7, %ymm5, %ymm1
+ vunpcklpd %ymm6, %ymm4, %ymm2
+ vunpckhpd %ymm6, %ymm4, %ymm3
+
+ vextractf128 $0x1, %ymm0, %xmm4
+ vextractf128 $0x1, %ymm1, %xmm5
+ vextractf128 $0x1, %ymm2, %xmm6
+ vextractf128 $0x1, %ymm3, %xmm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm0
+ vmovaps 32(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm1
+ vmovaps 64(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm2
+ vmovaps 96(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm3
+ vmovaps 128(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm4
+ vmovaps 160(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm5
+ vmovaps 192(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm6
+ vmovaps 224(%r12), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_lib8, .-inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
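+// Reference sketch in C (not from this file): same alpha/beta scaling, but the
+// 8x4 register block is first transposed into 4x8 (two vunpck passes plus
+// vextractf128), so it is combined with a 4x8 C stored in 8-row panels:
+//
+//   // res is 4x8: row ii (ii<4), column jj (jj<8)
+//   res[ii + 4*jj] = alpha*acc[jj + 8*ii]
+//                  + (beta!=0.0f ? beta*C[ii + 8*jj] : 0.0f);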
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x4_gen_lib8, @function
+inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			7f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r13), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x4_gen_lib8, .-inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_ab_4x8_gen_lib8, @function
+inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_ab_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_ab_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_ab_4x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // transpose
+ vunpcklps %ymm1, %ymm0, %ymm5
+ vunpckhps %ymm1, %ymm0, %ymm4
+ vunpcklps %ymm3, %ymm2, %ymm7
+ vunpckhps %ymm3, %ymm2, %ymm6
+
+ vunpcklpd %ymm7, %ymm5, %ymm0
+ vunpckhpd %ymm7, %ymm5, %ymm1
+ vunpcklpd %ymm6, %ymm4, %ymm2
+ vunpckhpd %ymm6, %ymm4, %ymm3
+
+ vextractf128 $0x1, %ymm0, %xmm4
+ vextractf128 $0x1, %ymm1, %xmm5
+ vextractf128 $0x1, %ymm2, %xmm6
+ vextractf128 $0x1, %ymm3, %xmm7
+
+ // beta
+	vbroadcastss	0(%r11), %ymm14
+
+	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
+
+	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
+	je			7f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+	vmovaps		0(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm0
+	vmovaps		32(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm1
+	vmovaps		64(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm2
+	vmovaps		96(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm3
+	vmovaps		128(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm4
+	vmovaps		160(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm5
+	vmovaps		192(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm6
+	vmovaps		224(%r13), %xmm15
+	vfmadd231ps	%xmm15, %xmm14, %xmm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_ab_4x8_gen_lib8, .-inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0
+//
+// input arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_8x4_lib8, @function
+inner_scale_a0_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_a0_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_a0_8x4_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_8x4_lib8, .-inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x4_lib8, @function
+inner_scale_11_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x4_lib8; .scl 2; .type 32; .endef
+inner_scale_11_8x4_lib8:
+#endif
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovaps .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovaps LC03(%rip), %ymm14
+#endif
+
+ vmovaps 0(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x4_lib8, .-inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// transpose and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_TRAN_SCALE_11_4X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_tran_scale_11_4x8_lib8, @function
+inner_tran_scale_11_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_tran_scale_11_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_tran_scale_11_4x8_lib8; .scl 2; .type 32; .endef
+inner_tran_scale_11_4x8_lib8:
+#endif
+#endif
+
+ // transpose
+ vunpcklps %ymm1, %ymm0, %ymm5
+ vunpckhps %ymm1, %ymm0, %ymm4
+ vunpcklps %ymm3, %ymm2, %ymm7
+ vunpckhps %ymm3, %ymm2, %ymm6
+
+ vunpcklpd %ymm7, %ymm5, %ymm0
+ vunpckhpd %ymm7, %ymm5, %ymm1
+ vunpcklpd %ymm6, %ymm4, %ymm2
+ vunpckhpd %ymm6, %ymm4, %ymm3
+
+ vextractf128 $0x1, %ymm0, %xmm4
+ vextractf128 $0x1, %ymm1, %xmm5
+ vextractf128 $0x1, %ymm2, %xmm6
+ vextractf128 $0x1, %ymm3, %xmm7
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovaps .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovaps LC03(%rip), %ymm14
+#endif
+
+ vmovaps 0(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm0
+ vmovaps 32(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm1
+ vmovaps 64(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm2
+ vmovaps 96(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm3
+ vmovaps 128(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm4
+ vmovaps 160(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm5
+ vmovaps 192(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm6
+ vmovaps 224(%r10), %xmm15
+ vfmadd231ps %xmm15, %xmm14, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_tran_scale_11_4x8_lib8, .-inner_tran_scale_11_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0 (generalized)
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x4_gen_lib8, @function
+inner_scale_11_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_8x4_gen_lib8:
+#endif
+#endif
+
+
+ // offset==0
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovaps .LC03(%rip), %ymm14
+#elif defined(OS_MAC)
+ vmovaps LC03(%rip), %ymm14
+#endif
+
+ vmovaps 0(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r11), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x4_gen_lib8, .-inner_scale_11_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_lib8, @function
+inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
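+
+	// the two rounds of vblendps undo the rotated (diagonal) accumulator layout
+	// documented above: ymm0-3 now hold the plain columns of the 8x4 block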
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vmulps %ymm15, %ymm14, %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_lib8, .-inner_blend_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- n0 // col index: start from (inc)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x4_gen_lib8, @function
+inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x4_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm15
+
+ vmulps %ymm0, %ymm15, %ymm0
+ vmulps %ymm1, %ymm15, %ymm1
+ vmulps %ymm2, %ymm15, %ymm2
+ vmulps %ymm3, %ymm15, %ymm3
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vmulps %ymm12, %ymm15, %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x4_gen_lib8, .-inner_blend_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_lib8, @function
+inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_lib8, .-inner_blend_scale_11_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend and scale for alpha=1.0 and beta=1.0 (generalized)
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d11 d22 d33]
+// ymm1 <- [d01 d10 d23 d32]
+// ymm2 <- [d03 d12 d21 d30]
+// ymm3 <- [d02 d13 d20 d31]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- [d00 d10 d20 d30]
+// ymm1 <- [d01 d11 d21 d31]
+// ymm2 <- [d02 d12 d22 d32]
+// ymm3 <- [d03 d13 d23 d33]
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x4_gen_lib8, @function
+inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x4_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm8 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm9 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm10
+ vblendps $0x55, %ymm3, %ymm2, %ymm11
+
+ vblendps $0xcc, %ymm11, %ymm8, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm11, %ymm8, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm10, %ymm9, %ymm1
+ vblendps $0x33, %ymm10, %ymm9, %ymm3
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %r15 // C0
+ addq %r12, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x4_gen_lib8, .-inner_blend_scale_11_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_lib8, @function
+inner_store_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_lib8:
+#endif
+#endif
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_lib8, .-inner_store_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_lib8, @function
+inner_store_4x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_lib8:
+#endif
+#endif
+
+ vmovaps %xmm0, 0(%r10)
+ vmovaps %xmm1, 32(%r10)
+ vmovaps %xmm2, 64(%r10)
+ vmovaps %xmm3, 96(%r10)
+ vmovaps %xmm4, 128(%r10)
+ vmovaps %xmm5, 160(%r10)
+ vmovaps %xmm6, 192(%r10)
+ vmovaps %xmm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_lib8, .-inner_store_4x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_vs_lib8, @function
+inner_store_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %ymm14, %ymm12, %ymm14
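+
+	// row mask: km is broadcast as a float and subtracted from the per-lane
+	// constants in LC00 (assumed to hold the ascending lane indices), so only
+	// lanes with index < km end up with the sign bit set for vmaskmovps below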
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm14, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmaskmovps %ymm1, %ymm14, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmaskmovps %ymm2, %ymm14, 64(%r10)
+ je 0f // end
+ vmaskmovps %ymm3, %ymm14, 96(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_vs_lib8, .-inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_vs_lib8, @function
+inner_store_4x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm14, %xmm14
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vsubps %xmm14, %xmm12, %xmm14
+
+ // offset==0
+ vmaskmovps %xmm0, %xmm14, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmaskmovps %xmm1, %xmm14, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmaskmovps %xmm2, %xmm14, 64(%r10)
+ cmpl $4, %r12d
+ jl 0f // end
+ vmaskmovps %xmm3, %xmm14, 96(%r10)
+ cmpl $5, %r12d
+ jl 0f // end
+ vmaskmovps %xmm4, %xmm14, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovps %xmm5, %xmm14, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovps %xmm6, %xmm14, 192(%r10)
+ je 0f // end
+ vmaskmovps %xmm7, %xmm14, 224(%r10)
+ //
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x8_vs_lib8, .-inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x4_gen_lib8, @function
+inner_store_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
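+
+	// two-sided row mask: sign(m0 - LC00) marks rows >= m0, sign(LC00 - m1)
+	// marks rows < m1; the AND keeps the sign bit only for m0 <= row < m1,
+	// which is the mask consumed by vmaskmovps below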
+
+ // shift D and sol for cols
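+	// (for n0 > 0 the first n0 columns are dropped: the accumulator columns are
+	// shifted down one register per skipped column and D advances by 32 bytes,
+	// i.e. one 8-float panel column, per step)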
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ je 7f // end
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x4_gen_lib8, .-inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x8_gen_lib8, @function
+inner_store_4x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_4x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %xmm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %xmm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %xmm12, %xmm14, %xmm14
+ vsubps %xmm15, %xmm12, %xmm15
+ vandps %xmm14, %xmm15, %xmm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ vmovaps %xmm6, %xmm5
+ vmovaps %xmm7, %xmm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ vmovaps %xmm6, %xmm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ vmovaps %xmm5, %xmm4
+ addq $32, %r11
+
+ cmpl $3, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ vmovaps %xmm4, %xmm3
+ addq $32, %r11
+
+ cmpl $4, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ vmovaps %xmm3, %xmm2
+ addq $32, %r11
+
+ cmpl $5, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ vmovaps %xmm2, %xmm1
+ addq $32, %r11
+
+ cmpl $6, %r15d
+ jle 0f
+
+ vmovaps %xmm1, %xmm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %xmm0, %xmm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmaskmovps %xmm1, %xmm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmaskmovps %xmm2, %xmm15, 64(%r11)
+ cmpl $4, %r15d
+ jl 7f // end
+ vmaskmovps %xmm3, %xmm15, 96(%r11)
+ cmpl $5, %r15d
+ jl 7f // end
+ vmaskmovps %xmm4, %xmm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmaskmovps %xmm5, %xmm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmaskmovps %xmm6, %xmm15, 192(%r11)
+ je 7f // end
+ vmaskmovps %xmm7, %xmm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_4x8_gen_lib8, .-inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_lib8, @function
+inner_store_l_8x4_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_lib8:
+#endif
+#endif
+
+ vmovaps 32(%r10), %ymm12
+ vmovaps 64(%r10), %ymm13
+ vmovaps 96(%r10), %ymm14
+
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vblendps $0x3, %ymm13, %ymm2, %ymm2
+ vblendps $0x7, %ymm14, %ymm3, %ymm3
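+
+	// the blends keep the elements above the diagonal of columns 1-3 from the
+	// values already in D, so only the lower-triangular part of the 8x4 block
+	// is overwritten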
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_lib8, .-inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_vs_lib8, @function
+inner_store_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ cmpl $2, %r12d
+ jl 0f // end
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ cmpl $3, %r12d
+ jl 0f // end
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x3, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ je 0f // end
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x7, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ //
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_vs_lib8, .-inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X4_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x4_gen_lib8, @function
+inner_store_l_8x4_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x4_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x4_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ cmpl $2, %r15d
+ jl 7f // end
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x1, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ cmpl $3, %r15d
+ jl 7f // end
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x3, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ je 7f // end
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x7, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x4_gen_lib8, .-inner_store_l_8x4_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
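+//
+// operation sketch (lib8 panel-major layout assumed: A is an 8 x k panel,
+// B holds the 4 used rows in an 8-wide panel, C and D are 8x4 panel blocks):
+//
+//   for(jj=0; jj<4; jj++)
+//     for(ii=0; ii<8; ii++)
+//       {
+//       tmp = 0.0;
+//       for(ll=0; ll<k; ll++)
+//         tmp += A[ii+8*ll] * B[jj+8*ll];
+//       D[ii+8*jj] = alpha[0]*tmp + beta[0]*C[ii+8*jj];
+//       }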
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_lib8
+ .type kernel_sgemm_nt_8x4_lib8, @function
+kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_lib8
+_kernel_sgemm_nt_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_lib8
+ .def kernel_sgemm_nt_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_lib8, .-kernel_sgemm_nt_8x4_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_sgemm_nt_4x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_lib8
+ .type kernel_sgemm_nt_4x8_lib8, @function
+kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_lib8
+_kernel_sgemm_nt_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_lib8
+ .def kernel_sgemm_nt_4x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG3, %r12 // A
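+
+	// the 4x8 kernel reuses the 8x4 inner kernel with A and B swapped, so the
+	// accumulator comes out transposed; the tran_scale routine below transposes
+	// it back while applying alpha and beta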
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_lib8, .-kernel_sgemm_nt_4x8_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_sgemm_nt_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_vs_lib8
+ .type kernel_sgemm_nt_8x4_vs_lib8, @function
+kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_vs_lib8
+_kernel_sgemm_nt_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_vs_lib8
+ .def kernel_sgemm_nt_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_vs_lib8, .-kernel_sgemm_nt_8x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_sgemm_nt_4x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_vs_lib8
+ .type kernel_sgemm_nt_4x8_vs_lib8, @function
+kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_vs_lib8
+_kernel_sgemm_nt_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_vs_lib8
+ .def kernel_sgemm_nt_4x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG4, %r11 // B
+ movq ARG3, %r12 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_vs_lib8, .-kernel_sgemm_nt_4x8_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_8x4_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
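+//
+// the _gen variant writes a general sub-block of the result: C and D may start
+// at a row offset inside their 8-row panel (offsetC/offsetD, panel strides
+// sdc/sdd) and only rows [m0,m1) and columns [n0,n1) are stored; note that the
+// offset>0 paths of the inner scale/store routines are still TODO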
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x4_gen_lib8
+ .type kernel_sgemm_nt_8x4_gen_lib8, @function
+kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x4_gen_lib8
+_kernel_sgemm_nt_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x4_gen_lib8
+ .def kernel_sgemm_nt_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x4_gen_lib8, .-kernel_sgemm_nt_8x4_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_4x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_4x8_gen_lib8
+ .type kernel_sgemm_nt_4x8_gen_lib8, @function
+kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_4x8_gen_lib8
+_kernel_sgemm_nt_4x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_4x8_gen_lib8
+ .def kernel_sgemm_nt_4x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_4x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+	movq	ARG4, %r11 // B
+	movq	ARG3, %r12 // A
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_AB_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_ab_4x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_ab_4x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_4x8_gen_lib8, .-kernel_sgemm_nt_4x8_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemm_nn_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
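+//
+// in the nn variant B is walked along k across its 8-row panels, so it carries
+// a starting offset inside the leading panel (offsetB) and a panel stride (sdb);
+// the inner_edge routine below is expected to consume the partial leading panel
+// before the main nn loop takes over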
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_lib8
+ .type kernel_sgemm_nn_8x4_lib8, @function
+kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_lib8
+_kernel_sgemm_nn_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_lib8
+ .def kernel_sgemm_nn_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_lib8, .-kernel_sgemm_nn_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_sgemm_nn_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_vs_lib8
+ .type kernel_sgemm_nn_8x4_vs_lib8, @function
+kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_vs_lib8
+_kernel_sgemm_nn_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_vs_lib8
+ .def kernel_sgemm_nn_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+	movq	ARG10, %r11 // km
+	movq	ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_vs_lib8, .-kernel_sgemm_nn_8x4_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_sgemm_nn_8x4_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x4_gen_lib8
+ .type kernel_sgemm_nn_8x4_gen_lib8, @function
+kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x4_gen_lib8
+_kernel_sgemm_nn_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x4_gen_lib8
+ .def kernel_sgemm_nn_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x4_gen_lib8, .-kernel_sgemm_nn_8x4_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_ssyrk_nt_l_8x4_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x4_lib8
+ .type kernel_ssyrk_nt_l_8x4_lib8, @function
+kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x4_lib8
+_kernel_ssyrk_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x4_lib8
+ .def kernel_ssyrk_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x4_lib8, .-kernel_ssyrk_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_nt_l_8x4_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x4_vs_lib8
+ .type kernel_ssyrk_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x4_vs_lib8
+_kernel_ssyrk_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x4_vs_lib8
+ .def kernel_ssyrk_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x4_vs_lib8, .-kernel_ssyrk_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_strsm_nt_rl_inv_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
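+//
+// operation sketch (as suggested by the inner routine names): the gemm_sub call
+// accumulates acc = C - A*B^T, then the trsm edge routine solves D*E^T = acc
+// for D, with E lower triangular and its reciprocal diagonal in inv_diag_E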
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x4_lib8
+ .type kernel_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x4_lib8
+_kernel_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x4_lib8
+ .def kernel_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+	vxorps	%ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x4_lib8, .-kernel_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_strsm_nt_rl_inv_4x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_4x8_lib8
+ .type kernel_strsm_nt_rl_inv_4x8_lib8, @function
+kernel_strsm_nt_rl_inv_4x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_4x8_lib8
+_kernel_strsm_nt_rl_inv_4x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_4x8_lib8
+ .def kernel_strsm_nt_rl_inv_4x8_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_4x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG3, %r11
+ movq ARG2, %r12
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq $8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_4x8_lib8, .-kernel_strsm_nt_rl_inv_4x8_lib8
+#endif
+
+
+
+
+
+// edi    rsi    rdx    rcx    r8     r9     rsp+8     rsp+16    rsp+24
+// void kernel_strsm_nt_rl_inv_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
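+//
+// Summary (inferred from the call sequence below): same computation as the
+// 8x4 kernel above, but for edge blocks: kn bounds the columns handled by the
+// triangular solve and km/kn mask the rows/columns actually stored to D.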
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+ .type kernel_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x4_vs_lib8
+ .def kernel_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+// edi    rsi    rdx    rcx    r8     r9     rsp+8     rsp+16    rsp+24
+// void kernel_strsm_nt_rl_inv_4x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
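+//
+// Summary (inferred from the call sequence below): edge-case variant of the
+// 4x8 kernel above; kn bounds the columns handled by the triangular solve and
+// km/kn mask the rows/columns actually stored to D.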
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_4x8_vs_lib8
+ .type kernel_strsm_nt_rl_inv_4x8_vs_lib8, @function
+kernel_strsm_nt_rl_inv_4x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_4x8_vs_lib8
+_kernel_strsm_nt_rl_inv_4x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_4x8_vs_lib8
+ .def kernel_strsm_nt_rl_inv_4x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_4x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG3, %r11
+ movq ARG2, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_TRAN_SCALE_11_4X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_tran_scale_11_4x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_tran_scale_11_4x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_4x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_4x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_4x8_vs_lib8, .-kernel_strsm_nt_rl_inv_4x8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
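+//
+// Summary (inferred from the call sequence below): fused gemm + trsm. The
+// kernel accumulates acc = C + Ap * Bp^T - Am * Bm^T (kp and km are the inner
+// sizes of the added and subtracted products), then solves D * E^T = acc
+// using inv_diag_E and stores the 8x4 block to D.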
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x4_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x4_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
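+//
+// Summary (inferred from the call sequence below): edge-case variant of the
+// fused gemm + trsm kernel above. Note that "km" appears twice in the
+// prototype: the 4th argument is the inner size of the subtracted product,
+// while the last two arguments mask the rows/columns stored to D (kn also
+// bounds the triangular solve).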
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6
+// void kernel_spotrf_nt_l_8x4_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
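+//
+// Summary (inferred from the call sequence below): the kernel accumulates
+// acc = C - A * B^T, performs the Cholesky-type factorization step on the
+// accumulated 8x4 block (inner_edge_potrf_8x4), writes the reciprocals of the
+// new diagonal entries to inv_diag_D, and stores the lower part of the block
+// to D.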
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x4_lib8
+ .type kernel_spotrf_nt_l_8x4_lib8, @function
+kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x4_lib8
+_kernel_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x4_lib8
+ .def kernel_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x4_lib8, .-kernel_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_spotrf_nt_l_8x4_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
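+//
+// Summary (inferred from the call sequence below): edge-case variant of the
+// kernel above; kn bounds the factorization and km/kn mask the rows/columns
+// of the lower block stored to D.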
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x4_vs_lib8
+ .type kernel_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x4_vs_lib8
+_kernel_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x4_vs_lib8
+ .def kernel_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // m1
+ movq ARG8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x4_vs_lib8, .-kernel_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_spotrf_nt_l_8x4_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
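+//
+// Summary (inferred from the call sequence below): fused syrk + potrf. The
+// kernel accumulates acc = C + Ap * Bp^T - Am * Bm^T, factorizes the 8x4
+// block (lower factor, using the vs factorization edge with n1 = 4 and
+// writing the reciprocal diagonal to inv_diag_D), and stores its lower part
+// to D.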
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x4_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x4_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x4_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x4_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
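+//
+// Summary (inferred from the call sequence below): edge-case variant of the
+// fused syrk + potrf kernel above. "km" appears twice in the prototype: the
+// 4th argument is the inner size of the subtracted product, while the last
+// two arguments mask the rows/columns stored to D (kn also bounds the
+// factorization).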
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x4_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x4_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x4_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x4_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_strmm_nn_rl_8x4_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D);
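+//
+// Summary (inferred from the call sequence below): triangular matrix-matrix
+// multiply D = alpha * A * B, with B lower triangular on the right (nn, rl).
+// The initial triangle of B is handled by dedicated edge routines, the
+// remaining columns by the plain nn gemm kernel; offsetB and sdb give the
+// sub-panel offset and panel stride of B.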
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_lib8
+ .type kernel_strmm_nn_rl_8x4_lib8, @function
+kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_lib8
+_kernel_strmm_nn_rl_8x4_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_lib8
+ .def kernel_strmm_nn_rl_8x4_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_lib8, .-kernel_strmm_nn_rl_8x4_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_strmm_nn_rl_8x4_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *D, int km, int kn);
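+//
+// Summary (inferred from the call sequence below): edge-case variant of the
+// kernel above; km/kn mask the rows/columns actually stored to D.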
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_vs_lib8
+ .type kernel_strmm_nn_rl_8x4_vs_lib8, @function
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_vs_lib8
+_kernel_strmm_nn_rl_8x4_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_vs_lib8
+ .def kernel_strmm_nn_rl_8x4_vs_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_vs_lib8, .-kernel_strmm_nn_rl_8x4_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13
+// void kernel_strmm_nn_rl_8x4_gen_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
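+//
+// Summary (inferred from the call sequence below): generalized-store variant
+// of the kernel above; the result is written to D at row offset offsetD with
+// panel stride sdd, restricted to the row range m0..m1 and column range
+// n0..n1 by the generalized store routine.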
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strmm_nn_rl_8x4_gen_lib8
+ .type kernel_strmm_nn_rl_8x4_gen_lib8, @function
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strmm_nn_rl_8x4_gen_lib8
+_kernel_strmm_nn_rl_8x4_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strmm_nn_rl_8x4_gen_lib8
+ .def kernel_strmm_nn_rl_8x4_gen_lib8; .scl 2; .type 32; .endef
+kernel_strmm_nn_rl_8x4_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovapd %ymm0, %ymm1
+ vmovapd %ymm0, %ymm2
+ vmovapd %ymm0, %ymm3
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRMM_NN_RL_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trmm_nn_rl_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trmm_nn_rl_8x4_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+	// call inner sgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x4_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_8X4_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_8x4_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_8x4_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // offsetD
+ movq ARG8, %r11 // D
+ movq ARG9, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG10, %r13 // m0
+ movq ARG11, %r14 // m1
+ movq ARG12, %r15 // n0
+ movq ARG13, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X4_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x4_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x4_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strmm_nn_rl_8x4_gen_lib8, .-kernel_strmm_nn_rl_8x4_gen_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
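+
+// The eight .long values in each block below are IEEE-754 single-precision
+// bit patterns for the numbers listed in the adjacent comments (0.5..7.5,
+// 8.5..15.5, 16.5..23.5, eight 1.0 entries, and a variant with the sign bit
+// set in the two upper entries). They appear to serve as index/sign vectors
+// for building masks in the partial and generalized load/store helpers of
+// this file.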
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC04: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/avx2/kernel_sgemm_8x8_lib8.S b/kernel/avx2/kernel_sgemm_8x8_lib8.S
new file mode 100644
index 0000000..094acda
--- /dev/null
+++ b/kernel/avx2/kernel_sgemm_8x8_lib8.S
@@ -0,0 +1,5395 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp); \
+ vzeroupper;
+#define EPILOGUE \
+ vzeroupper; \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
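+
+// Notes on the macros above: on Linux/Mac (System V AMD64 ABI) the first six
+// integer/pointer arguments arrive in rdi, rsi, rdx, rcx, r8, r9, and after
+// the prologue's stack adjustment the 7th argument is found at
+// STACKSIZE+8(%rsp). On Windows x64 only rcx, rdx, r8, r9 carry arguments,
+// the caller reserves a 32-byte shadow space (hence ARG5 at
+// STACKSIZE+40(%rsp)), and rsi, rdi and xmm6-xmm15 are callee-saved, which is
+// why the Windows prologue/epilogue also spill and restore them. vzeroupper
+// avoids AVX-SSE transition penalties around the call boundaries.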
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nt_8x8_lib8, @function
+inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nt_8x8_lib8:
+#endif
+#endif
+
+
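+// The inner 8x8 nt kernel below carries two alternative implementations,
+// selected at assembly time by the "#if 1" that follows: a broadcast scheme,
+// where each accumulator ymm0-ymm7 holds one full column of the 8x8 result
+// and B elements are broadcast one scalar at a time, and a shuffle scheme,
+// where 128-bit halves of B are rotated with vshufps between fused
+// multiply-adds. Only the broadcast scheme is assembled as committed.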
+// broadcast scheme
+#if 1
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+
+ cmpl $3, %r10d
+ jle 4f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm13 // A
+ vbroadcastss 0(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 4(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 8(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 12(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 16(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 20(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 24(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 28(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm7
+ subl $4, %r10d
+
+	// unroll 1
+ vmovaps 32(%r11), %ymm13 // A
+ vbroadcastss 32(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 36(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 40(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 44(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 48(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 52(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 56(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 60(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vmovaps -64(%r11), %ymm13 // A
+ vbroadcastss 64(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 68(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 72(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 76(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 80(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 84(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 88(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 92(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm7
+ addq $128, %r12
+
+	// unroll 3
+ vmovaps -32(%r11), %ymm13 // A
+ vbroadcastss -32(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss -28(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss -24(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss -20(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss -16(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss -12(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss -8(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss -4(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm13 // a
+ vbroadcastss 0(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 4(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm1
+ subl $1, %r10d
+ vbroadcastss 8(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm2
+ addq $32, %r11
+ vbroadcastss 12(%r12), %ymm12 // b
+ vfmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 16(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 20(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 24(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 28(%r12), %ymm12 // B
+ vfmadd231ps %ymm13, %ymm12, %ymm7
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+// shuffle scheme
+#else
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+ vbroadcastf128 32(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm12, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm7
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vfmadd231ps %ymm13, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm3
+ vbroadcastf128 64(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+ vbroadcastf128 96(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm12, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm7
+ vbroadcastf128 112(%r12), %ymm15 // B
+ vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vfmadd231ps %ymm13, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm3
+ vbroadcastf128 0(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vbroadcastf128 16(%r12), %ymm15 // B
+ vmovaps 32(%r11), %ymm13 // A
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+ vbroadcastf128 32(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm12, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm7
+ vbroadcastf128 48(%r12), %ymm15 // B
+ vmovaps 64(%r11), %ymm12 // A
+
+
+ // unroll 1
+ vfmadd231ps %ymm13, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm3
+ vbroadcastf128 64(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+ vbroadcastf128 80(%r12), %ymm15 // B
+ vmovaps 96(%r11), %ymm13 // A
+
+
+ // unroll 2
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+ vbroadcastf128 96(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm12, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm12, %ymm15, %ymm7
+ vbroadcastf128 112(%r12), %ymm15 // B
+// vmovaps 128(%r11), %ymm12 // A
+
+ subl $4, %r10d
+ addq $128, %r11
+ addq $128, %r12
+
+ // unroll 3
+ vfmadd231ps %ymm13, %ymm14, %ymm0
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm1
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm2
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+
+ vfmadd231ps %ymm13, %ymm14, %ymm3
+// vbroadcastf128 0(%r12), %ymm14 // B
+
+ vfmadd231ps %ymm13, %ymm15, %ymm4
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm5
+ vshufps $0x4e, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm6
+ vshufps $0xb1, %ymm15, %ymm15, %ymm15
+
+ vfmadd231ps %ymm13, %ymm15, %ymm7
+// vbroadcastf128 16(%r12), %ymm15 // B
+// vmovaps 32(%r11), %ymm13 // A
+
+
+// cmpl $4, %r10d
+ jmp 2f // return
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vbroadcastf128 0(%r12), %ymm14 // B
+ vmovaps 0(%r11), %ymm12 // A
+ vfmadd231ps %ymm12, %ymm14, %ymm0
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vfmadd231ps %ymm12, %ymm14, %ymm1
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vfmadd231ps %ymm12, %ymm14, %ymm2
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vfmadd231ps %ymm12, %ymm14, %ymm3
+
+ vbroadcastf128 16(%r12), %ymm14 // B
+ vfmadd231ps %ymm12, %ymm14, %ymm4
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vfmadd231ps %ymm12, %ymm14, %ymm5
+
+ vshufps $0x4e, %ymm14, %ymm14, %ymm14
+ vfmadd231ps %ymm12, %ymm14, %ymm6
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $32, %r12
+
+ vshufps $0xb1, %ymm14, %ymm14, %ymm14
+ vfmadd231ps %ymm12, %ymm14, %ymm7
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#endif
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nt_8x8_lib8, .-inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+8*k*sizeof(float)
+// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
+// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
+// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
+// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_sub_nt_8x8_lib8, @function
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_sub_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_sub_nt_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_sub_nt_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // preload
+
+ cmpl $3, %r10d
+ jle 4f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm13 // A
+ vbroadcastss 0(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 4(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 8(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 12(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 16(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 20(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 24(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 28(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm7
+ subl $4, %r10d
+
+	// unroll 1
+ vmovaps 32(%r11), %ymm13 // A
+ vbroadcastss 32(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 36(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 40(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 44(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 48(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 52(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 56(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 60(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm7
+ addq $128, %r11
+
+	// unroll 2
+ vmovaps -64(%r11), %ymm13 // A
+ vbroadcastss 64(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 68(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss 72(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss 76(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 80(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 84(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 88(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 92(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm7
+ addq $128, %r12
+
+	// unroll 3
+ vmovaps -32(%r11), %ymm13 // A
+ vbroadcastss -32(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss -28(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ vbroadcastss -24(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ vbroadcastss -20(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss -16(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss -12(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss -8(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss -4(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm7
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm13 // a
+ vbroadcastss 0(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm0
+ vbroadcastss 4(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm1
+ subl $1, %r10d
+ vbroadcastss 8(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm2
+ addq $32, %r11
+ vbroadcastss 12(%r12), %ymm12 // b
+ vfnmadd231ps %ymm13, %ymm12, %ymm3
+ vbroadcastss 16(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm4
+ vbroadcastss 20(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm5
+ vbroadcastss 24(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm6
+ vbroadcastss 28(%r12), %ymm12 // B
+ vfnmadd231ps %ymm13, %ymm12, %ymm7
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_sub_nt_8x8_lib8, .-inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11   <- A+8*k*sizeof(float)
+// r12   <- B+(k/8)*sdb*8*sizeof(float)+(k%8)*sizeof(float)
+// r13   <- 8*sdb*sizeof(float)
+// r14   <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_gemm_add_nn_8x8_lib8, @function
+inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_kernel_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_kernel_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ cmpl $8, %r10d
+ jl 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+// prefetcht0 0(%r12, %r13, 1) // software prefetch
+// prefetcht0 64(%r12, %r13, 1) // software prefetch
+// prefetcht0 128(%r12, %r13, 1) // software prefetch
+// prefetcht0 192(%r12, %r13, 1) // software prefetch
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 1
+ vmovaps 32(%r11), %ymm12 // A[0]
+ vbroadcastss 4(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 36(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 68(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 100(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 132(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 164(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 196(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 228(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 2
+ vmovaps 64(%r11), %ymm12 // A[0]
+ vbroadcastss 8(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 40(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 72(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 104(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 136(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 168(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 200(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 232(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 3
+ vmovaps 96(%r11), %ymm12 // A[0]
+ vbroadcastss 12(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 44(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 76(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 108(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 140(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 172(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 204(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 236(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 4
+ vmovaps 128(%r11), %ymm12 // A[0]
+ vbroadcastss 16(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 48(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 80(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 112(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 144(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 176(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 208(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 240(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ // unroll 5
+ vmovaps 160(%r11), %ymm12 // A[0]
+ vbroadcastss 20(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 52(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 84(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 116(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 148(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 180(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 212(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 244(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+ subl $8, %r10d
+
+ // unroll 6
+ vmovaps 192(%r11), %ymm12 // A[0]
+ vbroadcastss 24(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 56(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 88(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 120(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 152(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 184(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 216(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 248(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+ addq $256, %r11
+
+ // unroll 7
+ vmovaps -32(%r11), %ymm12 // A[0]
+ vbroadcastss 28(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 60(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 92(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 124(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 156(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 188(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 220(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 252(%r12), %ymm13 // B[7]
+	addq	%r13, %r12 // B += bs*sdb*sizeof(float)
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ cmpl $7, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+3: // clean1-up loop
+
+ // unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ subl $1, %r10d
+ addq $32, %r11
+ addq $4, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_gemm_add_nn_8x8_lib8, .-inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- k-(8-offB)
+// r11 <- A+(8-offB)*bs*sizeof(float)
+// r12 <- B-offB+bs*sdb*sizeof(float)
+// r13 <- bs*sdb*sizeof(float)
+// r14 <- offB
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm15 <- dirty
+
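+// A reference sketch in C (comments only, nothing here is assembled) of the edge
+// handling implemented below, assuming bs=8 and single precision: consume
+// kend = min(k, 8-offsetB) columns of the first B panel one at a time, then move
+// B to the start of the next panel; acc[j][i] stands for lane i of ymm0..ymm7.
+//
+//   int kend = (8-offsetB < k) ? 8-offsetB : k;
+//   B += offsetB;                            // point into the current 8-row panel
+//   for (int l = 0; l < kend; l++) {
+//       for (int j = 0; j < 8; j++)
+//           for (int i = 0; i < 8; i++)
+//               acc[j][i] += A[i] * B[8*j];  // vbroadcastss + vfmadd231ps
+//       A += 8;                              // next column of A
+//       B += 1;                              // next row inside the B panel
+//   }
+//   if (k > kend)
+//       B += 8*sdb - 8;                      // jump to the next panel of B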
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_gemm_add_nn_8x8_lib8, @function
+inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_edge_gemm_add_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_gemm_add_nn_8x8_lib8; .scl 2; .type 32; .endef
+inner_edge_gemm_add_nn_8x8_lib8:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $8, %ebx
+ subl %r14d, %ebx // 8-offsetB
+ cmpl %r10d, %ebx
+// jle 0f
+// movl %r10d, %ebx // kend=min(k,8-offsetB)
+//0:
+ cmovgl %r10d, %ebx // kend=min(k,8-offsetB)
+
+ movl %r14d, %eax
+ sall $2, %eax // offsetB*sizeof(float)
+ addq %rax, %r12 // B+offsetB*sizeof(float)
+
+1: // kend loop
+	// unroll 0
+ vmovaps 0(%r11), %ymm12 // A[0]
+ vbroadcastss 0(%r12), %ymm13 // B[0]
+ vfmadd231ps %ymm12, %ymm13, %ymm0
+ vbroadcastss 32(%r12), %ymm13 // B[1]
+ vfmadd231ps %ymm12, %ymm13, %ymm1
+ vbroadcastss 64(%r12), %ymm13 // B[2]
+ vfmadd231ps %ymm12, %ymm13, %ymm2
+ vbroadcastss 96(%r12), %ymm13 // B[3]
+ vfmadd231ps %ymm12, %ymm13, %ymm3
+ vbroadcastss 128(%r12), %ymm13 // B[4]
+ vfmadd231ps %ymm12, %ymm13, %ymm4
+ vbroadcastss 160(%r12), %ymm13 // B[5]
+ vfmadd231ps %ymm12, %ymm13, %ymm5
+ vbroadcastss 192(%r12), %ymm13 // B[6]
+ vfmadd231ps %ymm12, %ymm13, %ymm6
+ vbroadcastss 224(%r12), %ymm13 // B[7]
+ vfmadd231ps %ymm12, %ymm13, %ymm7
+
+ subl $1, %r10d // k-1
+ subl $1, %ebx // kend-1
+ addq $32, %r11 // A+1*bs*sizeof(float)
+ addq $4, %r12 // B+1*sizeof(float)
+
+ cmpl $0, %ebx
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(float)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_gemm_add_nn_8x8_lib8, .-inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// strsm
+// right
+// lower
+// transposed
+// not-unit
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
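+// Reference sketch in C (comments only) of the solve done in registers below: the
+// accumulators hold an 8x8 block X, r10 points to the lower-triangular factor
+// (called D above, lib8 layout) and r11 to the precomputed reciprocals of its
+// diagonal; the routine computes X := X * D^{-T} column by column, with the kn
+// compares cutting the loop short for partial blocks:
+//
+//   for (int j = 0; j < 8; j++) {                      // kn (r12d) may stop this early
+//       for (int i = 0; i < 8; i++)
+//           X[j][i] *= inv_diag_D[j];                  // vmulps with broadcast
+//       for (int jj = j+1; jj < 8; jj++)
+//           for (int i = 0; i < 8; i++)
+//               X[jj][i] -= D[8*j+jj] * X[j][i];       // vfnmadd231ps
+//   }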
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_trsm_rlt_inv_8x8_vs_lib8, @function
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_trsm_rlt_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_trsm_rlt_inv_8x8_vs_lib8:
+#endif
+#endif
+
+ vbroadcastss 0(%r11), %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vbroadcastss 4(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vbroadcastss 8(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vbroadcastss 12(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vbroadcastss 16(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm4
+ vbroadcastss 20(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm5
+ vbroadcastss 24(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm6
+ vbroadcastss 28(%r10), %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm7
+
+ vbroadcastss 4(%r11), %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vbroadcastss 40(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vbroadcastss 44(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vbroadcastss 48(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm4
+ vbroadcastss 52(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm5
+ vbroadcastss 56(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm6
+ vbroadcastss 60(%r10), %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm7
+
+ vbroadcastss 8(%r11), %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vbroadcastss 76(%r10), %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vbroadcastss 80(%r10), %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm4
+ vbroadcastss 84(%r10), %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm5
+ vbroadcastss 88(%r10), %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm6
+ vbroadcastss 92(%r10), %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm7
+
+ vbroadcastss 12(%r11), %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vbroadcastss 112(%r10), %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm4
+ vbroadcastss 116(%r10), %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm5
+ vbroadcastss 120(%r10), %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm6
+ vbroadcastss 124(%r10), %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm7
+
+ vbroadcastss 16(%r11), %ymm13
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $6, %r12d
+ jl 0f // ret
+ vbroadcastss 148(%r10), %ymm13
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vbroadcastss 152(%r10), %ymm13
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vbroadcastss 156(%r10), %ymm13
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+
+ vbroadcastss 20(%r11), %ymm13
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $7, %r12d
+ jl 0f // ret
+ vbroadcastss 184(%r10), %ymm13
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vbroadcastss 188(%r10), %ymm13
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+
+ vbroadcastss 24(%r11), %ymm13
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $8, %r12d
+ jl 0f // ret
+ vbroadcastss 220(%r10), %ymm13
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+
+ vbroadcastss 28(%r11), %ymm13
+ vmulps %ymm7, %ymm13, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_trsm_rlt_inv_8x8_vs_lib8, .-inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm12 <- dirty
+// ymm13 <- dirty
+// ymm14 <- dirty
+// ymm15 <- dirty
+
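+// Reference sketch in C (comments only) of the factorization below: an unblocked
+// Cholesky of the 8x8 block held in ymm0..ymm7 (acc[j][i] = lane i of column j),
+// writing the reciprocal of each pivot to inv_diag_E and falling back to 0.0 when
+// a pivot is not > 0.0, exactly like the jbe paths below; sqrtf is from <math.h>.
+//
+//   for (int j = 0; j < 8; j++) {                      // kn (r11d) may cut this short
+//       float d = acc[j][j];
+//       float dinv = (d > 0.0f) ? 1.0f/sqrtf(d) : 0.0f;
+//       inv_diag_E[j] = dinv;
+//       for (int i = 0; i < 8; i++)
+//           acc[j][i] *= dinv;                         // scale column j
+//       for (int jj = j+1; jj < 8; jj++)
+//           for (int i = 0; i < 8; i++)
+//               acc[jj][i] -= acc[j][jj] * acc[j][i];  // trailing rank-1 update
+//   }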
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_potrf_8x8_vs_lib8, @function
+inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_edge_potrf_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_potrf_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_edge_potrf_8x8_vs_lib8:
+#endif
+#endif
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovss .LC03(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovss LC03(%rip), %xmm14 // 1.0
+#endif
+
+ vmovss %xmm0, %xmm0, %xmm13
+ vucomiss %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+2:
+ vmovss %xmm13, 0(%r10)
+ vbroadcastss %xmm13, %ymm13
+// vpermilps $0x00, %xmm13, %xmm13
+// vinsertf128 $0x1, %xmm13, %ymm13, %ymm13
+ vmulps %ymm0, %ymm13, %ymm0
+ vperm2f128 $0x00, %ymm0, %ymm0, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm1
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm3
+ vperm2f128 $0x11, %ymm0, %ymm0, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm0, %ymm13, %ymm7
+
+
+ vpermilps $0x55, %xmm1, %xmm13
+ vucomiss %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+4:
+ vmovss %xmm13, 4(%r10)
+ vbroadcastss %xmm13, %ymm13
+ vmulps %ymm1, %ymm13, %ymm1
+ vperm2f128 $0x00, %ymm1, %ymm1, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm2
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm3
+ vperm2f128 $0x11, %ymm1, %ymm1, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm1, %ymm13, %ymm7
+
+
+ vpermilps $0xaa, %xmm2, %xmm13
+ vucomiss %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+6:
+ vmovss %xmm13, 8(%r10)
+ vbroadcastss %xmm13, %ymm13
+ vmulps %ymm2, %ymm13, %ymm2
+ vperm2f128 $0x00, %ymm2, %ymm2, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm3
+ vperm2f128 $0x11, %ymm2, %ymm2, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm2, %ymm13, %ymm7
+
+
+ vpermilps $0xff, %xmm3, %xmm13
+ vucomiss %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+8:
+ vmovss %xmm13, 12(%r10)
+ vbroadcastss %xmm13, %ymm13
+ vmulps %ymm3, %ymm13, %ymm3
+ vperm2f128 $0x11, %ymm3, %ymm3, %ymm11
+ vpermilps $0x00, %ymm11, %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm4
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm3, %ymm13, %ymm7
+
+
+ vextractf128 $0x1, %ymm4, %xmm13
+// vpermilps $0x00, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_44 > 0.0 ?
+ jbe 9f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+10:
+ vmovss %xmm13, 16(%r10)
+ vbroadcastss %xmm13, %ymm13
+ vmulps %ymm4, %ymm13, %ymm4
+ cmpl $6, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm4, %ymm4, %ymm11
+ vpermilps $0x55, %ymm11, %ymm13
+ vfnmadd231ps %ymm4, %ymm13, %ymm5
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm4, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm4, %ymm13, %ymm7
+
+
+ vextractf128 $0x1, %ymm5, %xmm13
+ vpermilps $0x55, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_55 > 0.0 ?
+ jbe 11f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+12:
+ vmovss %xmm13, 20(%r10)
+ vbroadcastss %xmm13, %ymm13
+ vmulps %ymm5, %ymm13, %ymm5
+ cmpl $7, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm5, %ymm5, %ymm11
+ vpermilps $0xaa, %ymm11, %ymm13
+ vfnmadd231ps %ymm5, %ymm13, %ymm6
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm5, %ymm13, %ymm7
+
+
+ vextractf128 $0x1, %ymm6, %xmm13
+ vpermilps $0xaa, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_66 > 0.0 ?
+ jbe 13f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+14:
+ vmovss %xmm13, 24(%r10)
+ vbroadcastss %xmm13, %ymm13
+ vmulps %ymm6, %ymm13, %ymm6
+ cmpl $8, %r11d
+ jl 0f // ret
+ vperm2f128 $0x11, %ymm6, %ymm6, %ymm11
+ vpermilps $0xff, %ymm11, %ymm13
+ vfnmadd231ps %ymm6, %ymm13, %ymm7
+
+
+ vextractf128 $0x1, %ymm7, %xmm13
+ vpermilps $0xff, %xmm13, %xmm13
+	vucomiss	%xmm15, %xmm13 // d_77 > 0.0 ?
+ jbe 15f
+ vsqrtss %xmm13, %xmm13, %xmm13
+ vdivss %xmm13, %xmm14, %xmm13
+16:
+ vmovss %xmm13, 28(%r10)
+ vbroadcastss %xmm13, %ymm13
+ vmulps %ymm7, %ymm13, %ymm7
+
+
+ jmp 0f
+
+
+1:
+ vxorps %ymm13, %ymm13, %ymm13
+ jmp 2b
+
+3:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 4b
+
+5:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 6b
+
+7:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 8b
+
+9:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 10b
+
+11:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 12b
+
+13:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 14b
+
+15:
+ vxorpd %ymm13, %ymm13, %ymm13
+ jmp 16b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_potrf_8x8_vs_lib8, .-inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
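+// Reference sketch in C (comments only) of the scaling below, with acc[j][i] the
+// lanes of ymm0..ymm7; the beta==0.0 test lets the routine skip loading C entirely:
+//
+//   for (int j = 0; j < 8; j++)
+//       for (int i = 0; i < 8; i++) {
+//           acc[j][i] *= alpha[0];
+//           if (beta[0] != 0.0f)
+//               acc[j][i] += beta[0] * C[8*j+i];
+//       }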
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x8_lib8, @function
+inner_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+ vmovaps 128(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 160(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 192(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 224(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x8_lib8, .-inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// r15 <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_8x8_gen_lib8, @function
+inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vmovaps 128(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm4
+ vmovaps 160(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm5
+ vmovaps 192(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm6
+ vmovaps 224(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+	cmpl	$4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+	cmpl	$2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+	cmpl	$6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_8x8_gen_lib8, .-inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x8_lib8, @function
+inner_scale_11_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x8_lib8; .scl 2; .type 32; .endef
+inner_scale_11_8x8_lib8:
+#endif
+#endif
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmovaps 128(%r10), %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 160(%r10), %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 192(%r10), %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 224(%r10), %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x8_lib8, .-inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_11_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_11_8x8_gen_lib8, @function
+inner_scale_11_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_scale_11_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_11_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_scale_11_8x8_gen_lib8:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+ vmovaps 128(%r11), %ymm12
+ vaddps %ymm4, %ymm12, %ymm4
+ vmovaps 160(%r11), %ymm12
+ vaddps %ymm5, %ymm12, %ymm5
+ vmovaps 192(%r11), %ymm12
+ vaddps %ymm6, %ymm12, %ymm6
+ vmovaps 224(%r11), %ymm12
+ vaddps %ymm7, %ymm12, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_11_8x8_gen_lib8, .-inner_scale_11_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x8_lib8, @function
+inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm14
+
+ vxorps %ymm15, %ymm15, %ymm15 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 0f // end
+
+ vmovaps 0(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm0
+ vmovaps 32(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm1
+ vmovaps 64(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm2
+ vmovaps 96(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm3
+ vmovaps 128(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm4
+ vmovaps 160(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm5
+ vmovaps 192(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm6
+ vmovaps 224(%r12), %ymm15
+ vfmadd231ps %ymm15, %ymm14, %ymm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x8_lib8, .-inner_blend_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- offset
+// r13 <- C
+// r14 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_8x8_gen_lib8, @function
+inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_ab_8x8_gen_lib8:
+#endif
+#endif
+
+ // alpha
+ vbroadcastss 0(%r10), %ymm11
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vmulps %ymm0, %ymm11, %ymm0
+ vmulps %ymm1, %ymm11, %ymm1
+ vmulps %ymm2, %ymm11, %ymm2
+ vmulps %ymm3, %ymm11, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmulps %ymm4, %ymm11, %ymm4
+ vmulps %ymm5, %ymm11, %ymm5
+ vmulps %ymm6, %ymm11, %ymm6
+ vmulps %ymm7, %ymm11, %ymm7
+
+ // beta
+ vbroadcastss 0(%r11), %ymm15
+
+ vxorps %ymm14, %ymm14, %ymm14 // 0.0
+
+ vucomiss %xmm15, %xmm14 // beta==0.0 ?
+ je 3f // end
+
+ cmpl $0, %r12d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm0
+ vmovaps 32(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm1
+ vmovaps 64(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm2
+ vmovaps 96(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm3
+ vmovaps 128(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm4
+ vmovaps 160(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm5
+ vmovaps 192(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm6
+ vmovaps 224(%r13), %ymm12
+ vfmadd231ps %ymm12, %ymm15, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r13, %r15 // C0
+ addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double)
+
+ cmpl $4, %r12d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r12d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r12d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_8x8_gen_lib8, .-inner_blend_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend for generic alpha=1.0 and beta=1.0
+//
+// input arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- C
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x8_lib8, @function
+inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x8_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ vmovaps 0(%r10), %ymm15
+ vaddps %ymm0, %ymm15, %ymm0
+ vmovaps 32(%r10), %ymm15
+ vaddps %ymm1, %ymm15, %ymm1
+ vmovaps 64(%r10), %ymm15
+ vaddps %ymm2, %ymm15, %ymm2
+ vmovaps 96(%r10), %ymm15
+ vaddps %ymm3, %ymm15, %ymm3
+ vmovaps 128(%r10), %ymm15
+ vaddps %ymm4, %ymm15, %ymm4
+ vmovaps 160(%r10), %ymm15
+ vaddps %ymm5, %ymm15, %ymm5
+ vmovaps 192(%r10), %ymm15
+ vaddps %ymm6, %ymm15, %ymm6
+ vmovaps 224(%r10), %ymm15
+ vaddps %ymm7, %ymm15, %ymm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x8_lib8, .-inner_blend_scale_11_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+//
+// output arguments:
+// r10 <- offset
+// r11 <- C
+// r12 <- 4*sdc*sizeof(double)
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+// ymm8 <- dirty
+// ymm9 <- dirty
+// ymm10 <- dirty
+// ymm11 <- dirty
+// ymm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_8x8_gen_lib8, @function
+inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_blend_scale_11_8x8_gen_lib8:
+#endif
+#endif
+
+ vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
+ vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
+ vblendps $0xaa, %ymm3, %ymm2, %ymm14
+ vblendps $0x55, %ymm3, %ymm2, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
+ vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
+ vblendps $0xcc, %ymm14, %ymm13, %ymm1
+ vblendps $0x33, %ymm14, %ymm13, %ymm3
+
+ vblendps $0xaa, %ymm5, %ymm4, %ymm12
+ vblendps $0x55, %ymm5, %ymm4, %ymm13
+ vblendps $0xaa, %ymm7, %ymm6, %ymm14
+ vblendps $0x55, %ymm7, %ymm6, %ymm15
+
+ vblendps $0xcc, %ymm15, %ymm12, %ymm4
+ vblendps $0x33, %ymm15, %ymm12, %ymm6
+ vblendps $0xcc, %ymm14, %ymm13, %ymm5
+ vblendps $0x33, %ymm14, %ymm13, %ymm7
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+
+ vmovaps 0(%r11), %ymm12
+ vaddps %ymm0, %ymm12, %ymm0
+ vmovaps 32(%r11), %ymm12
+ vaddps %ymm1, %ymm12, %ymm1
+ vmovaps 64(%r11), %ymm12
+ vaddps %ymm2, %ymm12, %ymm2
+ vmovaps 96(%r11), %ymm12
+ vaddps %ymm3, %ymm12, %ymm3
+ vmovaps 128(%r11), %ymm12
+ vaddps %ymm4, %ymm12, %ymm4
+ vmovaps 160(%r11), %ymm12
+ vaddps %ymm5, %ymm12, %ymm5
+ vmovaps 192(%r11), %ymm12
+ vaddps %ymm6, %ymm12, %ymm6
+ vmovaps 224(%r11), %ymm12
+ vaddps %ymm7, %ymm12, %ymm7
+
+ jmp 7f
+
+0:
+
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+	movq	%r11, %r15 // C0
+	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_8x8_gen_lib8, .-inner_blend_scale_11_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_lib8, @function
+inner_store_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_lib8:
+#endif
+#endif
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps %ymm1, 32(%r10)
+ vmovaps %ymm2, 64(%r10)
+ vmovaps %ymm3, 96(%r10)
+ vmovaps %ymm4, 128(%r10)
+ vmovaps %ymm5, 160(%r10)
+ vmovaps %ymm6, 192(%r10)
+ vmovaps %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_lib8, .-inner_store_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
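+// Sketch in C (comments only) of the row mask computed below, assuming the .LC00
+// constant holds the eight floats {0.5, 1.5, ..., 7.5}: lane i ends up with its
+// sign bit set exactly when i < km, so vmaskmovps writes only the first km rows;
+// the kn compares then skip whole columns beyond kn.
+//
+//   float lc00[8] = {0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f};
+//   int write_row[8];
+//   for (int i = 0; i < 8; i++)
+//       write_row[i] = (lc00[i] - (float)km) < 0.0f;   // vsubps, then the sign bit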
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_vs_lib8, @function
+inner_store_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ vmaskmovps %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmaskmovps %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmaskmovps %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmaskmovps %ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_vs_lib8, .-inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_8x8_gen_lib8, @function
+inner_store_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_8x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ vmaskmovps %ymm4, %ymm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmaskmovps %ymm5, %ymm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmaskmovps %ymm6, %ymm15, 192(%r11)
+ je 7f // end
+ vmaskmovps %ymm7, %ymm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_8x8_gen_lib8, .-inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower
+//
+// input arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+//
+// output arguments:
+// r10 <- D
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_lib8, @function
+inner_store_l_8x8_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_lib8:
+#endif
+#endif
+
+ vmovaps %ymm0, 0(%r10)
+ vmovaps 32(%r10), %ymm14
+ vblendps $0x01, %ymm14, %ymm1, %ymm1
+ vmovaps %ymm1, 32(%r10)
+ vmovaps 64(%r10), %ymm14
+ vblendps $0x03, %ymm14, %ymm2, %ymm2
+ vmovaps %ymm2, 64(%r10)
+ vmovaps 96(%r10), %ymm14
+ vblendps $0x07, %ymm14, %ymm3, %ymm3
+ vmovaps %ymm3, 96(%r10)
+ vmovaps 128(%r10), %ymm14
+ vblendps $0x0f, %ymm14, %ymm4, %ymm4
+ vmovaps %ymm4, 128(%r10)
+ vmovaps 160(%r10), %ymm14
+ vblendps $0x1f, %ymm14, %ymm5, %ymm5
+ vmovaps %ymm5, 160(%r10)
+ vmovaps 192(%r10), %ymm14
+ vblendps $0x3f, %ymm14, %ymm6, %ymm6
+ vmovaps %ymm6, 192(%r10)
+ vmovaps 224(%r10), %ymm14
+ vblendps $0x7f, %ymm14, %ymm7, %ymm7
+ vmovaps %ymm7, 224(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_8x8_lib8, .-inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower vs
+//
+// input arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- D
+// r11 <- km
+// r12 <- kn
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_VS_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_vs_lib8, @function
+inner_store_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_vs_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r11d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm15, %ymm12, %ymm15
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r10)
+ vmovaps 32(%r10), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r10)
+ vmovaps 64(%r10), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r10)
+ vmovaps 96(%r10), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r10)
+ vmovaps 128(%r10), %ymm12
+ vblendps $0x0f, %ymm12, %ymm4, %ymm4
+ vmaskmovps %ymm4, %ymm15, 128(%r10)
+ cmpl $6, %r12d
+ jl 0f // end
+ vmovaps 160(%r10), %ymm12
+ vblendps $0x1f, %ymm12, %ymm5, %ymm5
+ vmaskmovps %ymm5, %ymm15, 160(%r10)
+ cmpl $7, %r12d
+ jl 0f // end
+ vmovaps 192(%r10), %ymm12
+ vblendps $0x3f, %ymm12, %ymm6, %ymm6
+ vmaskmovps %ymm6, %ymm15, 192(%r10)
+ je 0f // end
+ vmovaps 224(%r10), %ymm12
+ vblendps $0x7f, %ymm12, %ymm7, %ymm7
+ vmaskmovps %ymm7, %ymm15, 224(%r10)
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_vs_lib8, .-inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store lower generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// ymm0 <- []
+// ymm1 <- []
+// ymm2 <- []
+// ymm3 <- []
+// ymm4 <- []
+// ymm5 <- []
+// ymm6 <- []
+// ymm7 <- []
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_8X8_GEN_LIB8
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_8x8_gen_lib8, @function
+inner_store_l_8x8_gen_lib8:
+#elif defined(OS_MAC)
+_inner_store_l_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_8x8_gen_lib8; .scl 2; .type 32; .endef
+inner_store_l_8x8_gen_lib8:
+#endif
+#endif
+
+ // compute mask for rows
+ vcvtsi2ss %r13d, %xmm14, %xmm14
+ vcvtsi2ss %r14d, %xmm15, %xmm15
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovups .LC00(%rip), %ymm12
+#elif defined(OS_MAC)
+ vmovups LC00(%rip), %ymm12
+#endif
+ vshufps $0x00, %xmm14, %xmm14, %xmm14
+ vshufps $0x00, %xmm15, %xmm15, %xmm15
+ vinsertf128 $0x1, %xmm14, %ymm14, %ymm14
+ vinsertf128 $0x1, %xmm15, %ymm15, %ymm15
+ vsubps %ymm12, %ymm14, %ymm14
+ vsubps %ymm15, %ymm12, %ymm15
+ vandps %ymm14, %ymm15, %ymm15
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ vmovaps %ymm7, %ymm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm2, %ymm1
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ vmovaps %ymm6, %ymm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovaps %ymm1, %ymm0
+ vmovaps %ymm3, %ymm2
+ vmovaps %ymm4, %ymm3
+ vmovaps %ymm5, %ymm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $8, %eax
+ jle 0f
+ movl $8, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+ cmpl $0, %r10d
+ jg 0f
+
+ // offset==0
+ vmaskmovps %ymm0, %ymm15, 0(%r11)
+ vmovaps 32(%r11), %ymm12
+ vblendps $0x01, %ymm12, %ymm1, %ymm1
+ vmaskmovps %ymm1, %ymm15, 32(%r11)
+ vmovaps 64(%r11), %ymm12
+ vblendps $0x03, %ymm12, %ymm2, %ymm2
+ vmaskmovps %ymm2, %ymm15, 64(%r11)
+ vmovaps 96(%r11), %ymm12
+ vblendps $0x07, %ymm12, %ymm3, %ymm3
+ vmaskmovps %ymm3, %ymm15, 96(%r11)
+ vmovaps 128(%r11), %ymm12
+ vblendps $0x0f, %ymm12, %ymm4, %ymm4
+ vmaskmovps %ymm4, %ymm15, 128(%r11)
+ cmpl $6, %r15d
+ jl 7f // end
+ vmovaps 160(%r11), %ymm12
+ vblendps $0x1f, %ymm12, %ymm5, %ymm5
+ vmaskmovps %ymm5, %ymm15, 160(%r11)
+ cmpl $7, %r15d
+ jl 7f // end
+ vmovaps 192(%r11), %ymm12
+ vblendps $0x3f, %ymm12, %ymm6, %ymm6
+ vmaskmovps %ymm6, %ymm15, 192(%r11)
+ je 7f // end
+ vmovaps 224(%r11), %ymm12
+ vblendps $0x7f, %ymm12, %ymm7, %ymm7
+ vmaskmovps %ymm7, %ymm15, 224(%r11)
+ //
+ jmp 7f
+
+0:
+ // offset > 0
+ // 1 2 3 4 5 6 7
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $4, %r10d
+ jl 1f
+ jg 2f
+
+ // offset==4
+ // TODO
+ jmp 7f
+
+1:
+ // 1 2 3
+
+ cmpl $2, %r10d
+ jl 3f
+ jg 4f
+
+ // offset==2
+ // TODO
+ jmp 7f
+
+3:
+ // offset==1
+ // TODO
+ jmp 7f
+
+4:
+ // offset==3
+ // TODO
+ jmp 7f
+
+2:
+ // 5 6 7
+
+ cmpl $6, %r10d
+ jl 5f
+ jg 6f
+
+ // offset==6
+ // TODO
+ jmp 7f
+
+5:
+ // offset==5
+ // TODO
+ jmp 7f
+
+6:
+ // offset==7
+ // TODO
+ jmp 7f
+
+ // end
+7:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+	.size	inner_store_l_8x8_gen_lib8, .-inner_store_l_8x8_gen_lib8
+#endif
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
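+//
+// Reference sketch in C (comments only, not built) of the operation, assuming the
+// lib8 layout where each operand is stored in panels of 8 rows, column-major
+// inside the panel; for this 8x8 kernel a single panel covers all rows:
+//
+//   void ref_kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B,
+//                                     float *beta, float *C, float *D)
+//       {
+//       for (int j = 0; j < 8; j++)
+//           for (int i = 0; i < 8; i++)
+//               {
+//               float acc = 0.0f;
+//               for (int l = 0; l < k; l++)
+//                   acc += A[i+8*l] * B[j+8*l];   // nt: B enters transposed
+//               D[i+8*j] = alpha[0]*acc + beta[0]*C[i+8*j];
+//               }
+//       }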
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_lib8
+ .type kernel_sgemm_nt_8x8_lib8, @function
+kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_lib8
+_kernel_sgemm_nt_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_lib8
+ .def kernel_sgemm_nt_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_lib8, .-kernel_sgemm_nt_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemm_nt_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_vs_lib8
+ .type kernel_sgemm_nt_8x8_vs_lib8, @function
+kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_vs_lib8
+_kernel_sgemm_nt_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_vs_lib8
+ .def kernel_sgemm_nt_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+	movq	ARG8, %r11 // km
+	movq	ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_vs_lib8, .-kernel_sgemm_nt_8x8_vs_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72
+// void kernel_sgemm_nt_8x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
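+//
+// Sketch in C comments of the written region, for the offsetC==offsetD==0 case
+// (the offset>0 branches of the inner routines below are still TODO): with
+// acc[j][i] the accumulated (A*B^T)(i,j), only rows [m0,m1) and cols [n0,n1)
+// of the 8x8 block are stored:
+//
+//   for (int j = n0; j < 8 && j < n1; j++)
+//       for (int i = m0; i < 8 && i < m1; i++)
+//           D[i+8*j] = alpha[0]*acc[j][i] + beta[0]*C[i+8*j];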
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nt_8x8_gen_lib8
+ .type kernel_sgemm_nt_8x8_gen_lib8, @function
+kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nt_8x8_gen_lib8
+_kernel_sgemm_nt_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nt_8x8_gen_lib8
+ .def kernel_sgemm_nt_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nt_8x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nt_8x8_gen_lib8, .-kernel_sgemm_nt_8x8_gen_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
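+//
+// Reference sketch in C (comments only) of the nn operation, assuming the lib8
+// layout: A as in the nt kernels, while B is k x 8, stored in 8-row panels with
+// panel stride sdb and starting offsetB rows into its first panel; B_ELEM below
+// is just shorthand for that indexing, not a real macro in this code base.
+//
+//   #define B_ELEM(l,j) B[((l+offsetB)/8)*8*sdb + 8*(j) + (l+offsetB)%8]
+//
+//   for (int j = 0; j < 8; j++)
+//       for (int i = 0; i < 8; i++) {
+//           float acc = 0.0f;
+//           for (int l = 0; l < k; l++)
+//               acc += A[i+8*l] * B_ELEM(l, j);
+//           D[i+8*j] = alpha[0]*acc + beta[0]*C[i+8*j];
+//       }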
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_lib8
+ .type kernel_sgemm_nn_8x8_lib8, @function
+kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_lib8
+_kernel_sgemm_nn_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_lib8
+ .def kernel_sgemm_nn_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+ // call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_lib8, .-kernel_sgemm_nn_8x8_lib8
+#endif
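+
+// A minimal C-side usage sketch for the nn kernel above; the variable names and the
+// assumption that all operands are stored panel-major with 8-wide panels are
+// hypothetical, only the call itself follows the prototype comment:
+//
+//   float alpha = 1.0f, beta = 0.0f;
+//   // A, C, D point to 8-row panels; B is traversed along its panels as k advances,
+//   // so it also needs its panel stride sdb and the row offset offsetB (0..7) of its
+//   // first element.
+//   kernel_sgemm_nn_8x8_lib8(k, &alpha, A, offsetB, B, sdb, &beta, C, D);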
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_sgemm_nn_8x8_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_vs_lib8
+ .type kernel_sgemm_nn_8x8_vs_lib8, @function
+kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_vs_lib8
+_kernel_sgemm_nn_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_vs_lib8
+ .def kernel_sgemm_nn_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+	movq	ARG10, %r11 // km
+	movq	ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_vs_lib8, .-kernel_sgemm_nn_8x8_vs_lib8
+#endif
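+
+// The _vs_ (variable size) variant above differs from the plain 8x8 kernel only in
+// the store phase: km and kn are forwarded to inner_store_8x8_vs_lib8, which
+// presumably clips the stored block to km rows and kn columns for edge tiles.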
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88
+// void kernel_sgemm_nn_8x8_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_nn_8x8_gen_lib8
+ .type kernel_sgemm_nn_8x8_gen_lib8, @function
+kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_nn_8x8_gen_lib8
+_kernel_sgemm_nn_8x8_gen_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_nn_8x8_gen_lib8
+ .def kernel_sgemm_nn_8x8_gen_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_nn_8x8_gen_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+	sall	$5, %r13d // 8*sdb*sizeof(float)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nn_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nn_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // offsetC
+ movq ARG9, %r13 // C
+ movq ARG10, %r14 // sdc
+	sall	$5, %r14d // 8*sdc*sizeof(float)
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_gen_lib8
+#endif
+#endif
+
+
+ // store n gen
+
+ movq ARG11, %r10 // offsetD
+ movq ARG12, %r11 // D
+ movq ARG13, %r12 // sdd
+	sall	$5, %r12d // 8*sdd*sizeof(float)
+ movq ARG14, %r13 // m0
+ movq ARG15, %r14 // m1
+ movq ARG16, %r15 // n0
+ movq ARG17, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_GEN_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_gen_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_gen_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_nn_8x8_gen_lib8, .-kernel_sgemm_nn_8x8_gen_lib8
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_ssyrk_nt_l_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x8_lib8
+ .type kernel_ssyrk_nt_l_8x8_lib8, @function
+kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x8_lib8
+_kernel_ssyrk_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x8_lib8
+ .def kernel_ssyrk_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x8_lib8, .-kernel_ssyrk_nt_l_8x8_lib8
+#endif
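+
+// For the syrk kernels the accumulation is the same rank-k update as in the gemm nt
+// kernels; only the store routine changes (inner_store_l_8x8_lib8), which presumably
+// writes just the lower-triangular part of the 8x8 result, as the _l_ suffix suggests.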
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_nt_l_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_nt_l_8x8_vs_lib8
+ .type kernel_ssyrk_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_nt_l_8x8_vs_lib8
+_kernel_ssyrk_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_nt_l_8x8_vs_lib8
+ .def kernel_ssyrk_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_8x8_lib8
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_nt_l_8x8_vs_lib8, .-kernel_ssyrk_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_strsm_nt_rl_inv_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x8_lib8
+ .type kernel_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x8_lib8
+_kernel_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x8_lib8
+ .def kernel_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq $8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x8_lib8, .-kernel_strsm_nt_rl_inv_8x8_lib8
+#endif
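+
+// In the trsm kernels the triangular solve is folded into the epilogue:
+// inner_edge_trsm_rlt_inv_8x8_vs_lib8 consumes E together with inv_diag_E, whose
+// name suggests it holds the reciprocals of the diagonal of E, so the solve can use
+// multiplications instead of divisions; n1 is fixed to 8 here and taken from kn in
+// the _vs_ variant below.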
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_strsm_nt_rl_inv_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+ .type kernel_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_strsm_nt_rl_inv_8x8_vs_lib8
+ .def kernel_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // m1
+ movq ARG9, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x8_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq $8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x8_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11 12
+// void kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+ .type kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, @function
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+_kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+ .def kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_trsm_rlt_inv_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8, .-kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_spotrf_nt_l_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x8_lib8
+ .type kernel_spotrf_nt_l_8x8_lib8, @function
+kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x8_lib8
+_kernel_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x8_lib8
+ .def kernel_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movl $8, %r11d // n1
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x8_lib8, .-kernel_spotrf_nt_l_8x8_lib8
+#endif
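+
+// The potrf kernels reuse the gemm nt sub accumulation and then run
+// inner_edge_potrf_8x8_vs_lib8 on the accumulator, i.e. a Cholesky factorization of
+// the 8x8 block held in registers; inv_diag_D is an output here, presumably filled
+// with the reciprocals of the computed diagonal so that later trsm kernels can
+// consume it directly.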
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_spotrf_nt_l_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_spotrf_nt_l_8x8_vs_lib8
+ .type kernel_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_spotrf_nt_l_8x8_vs_lib8
+_kernel_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_spotrf_nt_l_8x8_vs_lib8
+ .def kernel_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_spotrf_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // m1
+ movq ARG8, %r12 // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_spotrf_nt_l_8x8_vs_lib8, .-kernel_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_ssyrk_spotrf_nt_l_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x8_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x8_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x8_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorps %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $8, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x8_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_lib8
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+ .type kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, @function
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_MAC)
+ .globl _kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+_kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#elif defined(OS_WINDOWS)
+ .globl kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+ .def kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8; .scl 2; .type 32; .endef
+kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ vxorpd %ymm0, %ymm0, %ymm0
+ vmovaps %ymm0, %ymm1
+ vmovaps %ymm0, %ymm2
+ vmovaps %ymm0, %ymm3
+ vmovaps %ymm0, %ymm4
+ vmovaps %ymm0, %ymm5
+ vmovaps %ymm0, %ymm6
+ vmovaps %ymm0, %ymm7
+
+
+	// call inner sgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_add_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_add_nt_8x8_lib8
+#endif
+#endif
+
+
+	// call inner sgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_gemm_sub_nt_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_kernel_gemm_sub_nt_8x8_lib8
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_11_8X8_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_11_8x8_lib8
+#elif defined(OS_MAC)
+ callq _inner_scale_11_8x8_lib8
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_POTRF_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_potrf_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_edge_potrf_8x8_vs_lib8
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_8X8_VS_LIB8
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_8x8_vs_lib8
+#elif defined(OS_MAC)
+ callq _inner_store_l_8x8_vs_lib8
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8, .-kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
+#endif
+ .long 1056964608
+ .long 1069547520
+ .long 1075838976
+ .long 1080033280
+ .long 1083179008
+ .long 1085276160
+ .long 1087373312
+ .long 1089470464
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
+#endif
+ .long 1091043328
+ .long 1092091904
+ .long 1093140480
+ .long 1094189056
+ .long 1095237632
+ .long 1096286208
+ .long 1097334784
+ .long 1098383360
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#elif defined(OS_MAC)
+ .align 5
+LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
+#endif
+ .long 1099169792
+ .long 1099694080
+ .long 1100218368
+ .long 1100742656
+ .long 1101266944
+ .long 1101791232
+ .long 1102315520
+ .long 1102839808
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+ .align 5
+LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
+#endif
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 1065353216
+ .long 3212836864
+ .long 3212836864
+
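+// The .long values above are IEEE-754 single-precision bit patterns, e.g.
+// 1056964608 = 0x3F000000 = 0.5f, 1065353216 = 0x3F800000 = 1.0f and
+// 3212836864 = 0xBF800000 = -1.0f, matching the lane contents listed next to each
+// label. The half-integer index vectors LC00-LC02 are presumably compared against
+// m0/m1/n0/n1 to build the masks used by the _gen_ and _vs_ load and store routines.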
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/c99/Makefile b/kernel/c99/Makefile
new file mode 100644
index 0000000..55d54ef
--- /dev/null
+++ b/kernel/c99/Makefile
@@ -0,0 +1,80 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_HASWELL)
+OBJS += kernel_dgemv_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o
+endif
+
+ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE)
+OBJS += kernel_dgemv_4_lib4.o
+#OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o
+OBJS +=
+endif
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+ifeq ($(TARGET), GENERIC)
+OBJS += kernel_dgemm_4x4_lib4.o kernel_dgemm_diag_lib4.o kernel_dgemv_4_lib4.o kernel_dsymv_4_lib4.o kernel_dgetrf_pivot_4_lib4.o kernel_dgeqrf_4_lib4.o
+OBJS += kernel_sgemm_4x4_lib4.o kernel_sgemm_diag_lib4.o kernel_sgemv_4_lib4.o kernel_ssymv_4_lib4.o kernel_sgetrf_pivot_4_lib4.o kernel_sgecp_lib4.o
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
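+# Note: for each TARGET this directory is expected to provide only the kernels that
+# the corresponding assembly directory does not, so the lists above are presumably
+# the complement of the target-specific kernel sets selected in kernel/Makefile.
+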
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
diff --git a/kernel/c99/kernel_dgemm_4x4_lib4.c b/kernel/c99/kernel_dgemm_4x4_lib4.c
new file mode 100644
index 0000000..167e356
--- /dev/null
+++ b/kernel/c99/kernel_dgemm_4x4_lib4.c
@@ -0,0 +1,6825 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+//#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_nt_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ double
+ *C1, *D1;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(offsetC==0)
+ {
+ c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==1)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==2)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+ }
+ else //if(offsetC==3)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+ }
+
+ // shift sol for cols
+ if(n0>0)
+ {
+ if(n0==1)
+ {
+ c_00 = c_01;
+ c_10 = c_11;
+ c_20 = c_21;
+ c_30 = c_31;
+
+ c_01 = c_02;
+ c_11 = c_12;
+ c_21 = c_22;
+ c_31 = c_32;
+
+ c_02 = c_03;
+ c_12 = c_13;
+ c_22 = c_23;
+ c_32 = c_33;
+
+ D0 += 1*bs;
+ }
+ else if(n0==2)
+ {
+ c_00 = c_02;
+ c_10 = c_12;
+ c_20 = c_22;
+ c_30 = c_32;
+
+ c_01 = c_03;
+ c_11 = c_13;
+ c_21 = c_23;
+ c_31 = c_33;
+
+ D0 += 2*bs;
+ }
+ else //if(n0==3)
+ {
+ c_00 = c_03;
+ c_10 = c_13;
+ c_20 = c_23;
+ c_30 = c_33;
+
+ D0 += 3*bs;
+ }
+ }
+
+ int kn = n1 - n0;
+
+ if(offsetD==0)
+ {
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+ if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+ if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+ if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+ if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+ }
+ else if(offsetD==1)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+ }
+ else if(offsetD==2)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+ }
+ else //if(offsetD==3)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+ if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+ if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+ if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+ if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+ }
+
+ return;
+
+ }
+#endif
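+
+
+// A minimal reference sketch of what the gen kernel above computes, written against
+// plain column-major arrays. The function name ref_dgemm_nt_4x4_gen and the leading
+// dimensions lda/ldb/ldc/ldd are hypothetical, introduced only for illustration
+// (it assumes m1<=4 and n1<=4) and kept out of the build by the #if 0 guard.
+#if 0
+static void ref_dgemm_nt_4x4_gen(int kmax, double alpha, const double *A, int lda,
+	const double *B, int ldb, double beta, const double *C, int ldc,
+	double *D, int ldd, int m0, int m1, int n0, int n1)
+	{
+	// hypothetical reference: D[i][j] = beta*C[i][j] + alpha*sum_k A[i][k]*B[j][k],
+	// restricted to rows m0<=i<m1 and columns n0<=j<n1 of the 4x4 tile
+	int i, j, k;
+	double c;
+	for(j=n0; j<n1; j++)
+		{
+		for(i=m0; i<m1; i++)
+			{
+			c = 0.0;
+			for(k=0; k<kmax; k++)
+				c += A[i+lda*k] * B[j+ldb*k]; // nt: B enters transposed
+			D[i+ldd*j] = beta*C[i+ldc*j] + alpha*c;
+			}
+		}
+	}
+#endif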
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_nt_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC)
+void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+ kernel_dgemm_nt_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+ }
+#endif
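+
+
+// Usage sketch for the fixed-size wrapper above; the buffer setup is hypothetical,
+// only the call itself follows the prototype:
+//
+//   double alpha = 1.0, beta = 1.0;
+//   // A and B are 4 x kmax blocks stored panel-major (4-wide panels), C and D are
+//   // 4 x 4 blocks in the same layout.
+//   kernel_dgemm_nt_4x4_lib4(kmax, &alpha, A, B, &beta, C, D);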
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_nn_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ double
+ *C1, *D1;
+
+ int k;
+
+ k = 0;
+ if(offsetB!=0)
+ {
+ if(offsetB==1)
+ {
+
+ B += 1;
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto scale;
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto scale;
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ B += bs*(sdb-1);
+ k += 1;
+
+ }
+ else if(offsetB==2)
+ {
+
+ B += 2;
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto scale;
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ B += bs*(sdb-1);
+ k += 1;
+
+ }
+ else // if(offsetB==3)
+ {
+
+ B += 3;
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ B += bs*(sdb-1);
+ k += 1;
+
+ }
+ }
+ for(; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[1];
+ b_1 = B[5];
+ b_2 = B[9];
+ b_3 = B[13];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[2];
+ b_1 = B[6];
+ b_2 = B[10];
+ b_3 = B[14];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[3];
+ b_1 = B[7];
+ b_2 = B[11];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+
+ }
+
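+	// scale: blend the accumulator with beta*C (honoring the row offset of C inside its
+	// panel), shift the solution left by n0 columns, then store only the rows [m0,m1) and
+	// columns [n0,n1) of the block into D at the requested row offset.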
+ scale:
+
+ if(offsetC==0)
+ {
+ c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==1)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==2)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+ }
+ else //if(offsetC==3)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+ }
+
+ // shift sol for cols
+ if(n0>0)
+ {
+ if(n0==1)
+ {
+ c_00 = c_01;
+ c_10 = c_11;
+ c_20 = c_21;
+ c_30 = c_31;
+
+ c_01 = c_02;
+ c_11 = c_12;
+ c_21 = c_22;
+ c_31 = c_32;
+
+ c_02 = c_03;
+ c_12 = c_13;
+ c_22 = c_23;
+ c_32 = c_33;
+
+ D0 += 1*bs;
+ }
+ else if(n0==2)
+ {
+ c_00 = c_02;
+ c_10 = c_12;
+ c_20 = c_22;
+ c_30 = c_32;
+
+ c_01 = c_03;
+ c_11 = c_13;
+ c_21 = c_23;
+ c_31 = c_33;
+
+ D0 += 2*bs;
+ }
+ else //if(n0==3)
+ {
+ c_00 = c_03;
+ c_10 = c_13;
+ c_20 = c_23;
+ c_30 = c_33;
+
+ D0 += 3*bs;
+ }
+ }
+
+ int kn = n1 - n0;
+
+ if(offsetD==0)
+ {
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+ if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+ if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+ if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+ if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+ }
+ else if(offsetD==1)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+ }
+ else if(offsetD==2)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+ }
+ else //if(offsetD==3)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+ if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+ if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+ if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+ if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
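+// D = beta*C + alpha*A*B for a full 4x4 block: thin wrapper around the _gen_ kernel with
+// zero row offsets for C and D and the whole [0,4)x[0,4) block stored.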
+void kernel_dgemm_nn_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D)
+ {
+ kernel_dgemm_nn_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, beta, 0, C, 0, 0, D, 0, 0, 4, 0, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
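+// Lower triangle of D = beta*C + alpha*A*B^T for one 4x4 block, with A and B packed in
+// 4-wide panels; offsetC/offsetD give the row offset of C/D inside their panels, and only
+// the rows [m0,m1) and columns [n0,n1) of the block are written back.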
+void kernel_dsyrk_nt_l_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0,
+ c_10=0, c_11=0,
+ c_20=0, c_21=0, c_22=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ double
+ *C1, *D1;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(offsetC==0)
+ {
+ c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+ c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+ c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+ c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==1)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+ c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+ c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+ c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==2)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+ c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+ c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+ c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+ }
+ else //if(offsetC==3)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+ c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+ c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+ c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+ }
+
+ // shift sol for cols
+ if(n0>0)
+ {
+ if(n0==1)
+ {
+ c_10 = c_11;
+ c_20 = c_21;
+ c_30 = c_31;
+
+ c_21 = c_22;
+ c_31 = c_32;
+
+ c_32 = c_33;
+
+ D0 += 1*bs;
+ }
+ else if(n0==2)
+ {
+ c_20 = c_22;
+ c_30 = c_32;
+
+ c_31 = c_33;
+
+ D0 += 2*bs;
+ }
+ else //if(n0==3)
+ {
+ c_30 = c_33;
+
+ D0 += 3*bs;
+ }
+ }
+
+ int kn = n1 - n0;
+
+ if(offsetD==0)
+ {
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+ if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+ if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+ if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+ }
+ else if(offsetD==1)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+ }
+ else if(offsetD==2)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+ }
+ else //if(offsetD==3)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+ if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
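+// Variable-size version of the 4x4 dsyrk nt kernel: lower triangle of
+// D = beta*C + alpha*A*B^T, storing only the first km rows and kn columns.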
+void kernel_dsyrk_nt_l_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0,
+ c_10=0, c_11=0,
+ c_20=0, c_21=0, c_22=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+ c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+ c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+ c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[2+bs*2] = c_22;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[1+bs*1] = c_11;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dsyrk_nt_l_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+ kernel_dsyrk_nt_l_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
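+// D = beta*C + alpha*A*B^T with B upper triangular (so B^T is lower triangular): the first
+// three k iterations touch only the columns already inside the triangle, the remaining ones
+// are plain 4x4 gemm steps; km/kn select the rows/columns actually stored.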
+void kernel_dtrmm_nt_ru_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ k = 0;
+
+ // k = 0
+ if(kmax>0)
+ {
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 4;
+ k++;
+ }
+
+ // k = 1
+	if(kmax>1)
+ {
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 4;
+ k++;
+ }
+
+ // k = 2
+	if(kmax>2)
+ {
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 4;
+ k++;
+ }
+
+ for(; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+ kernel_dtrmm_nt_ru_4x4_vs_lib4(k, alpha, A, B, beta, C, D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
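+// D = alpha*A*B with B lower triangular and not transposed, B packed in 4-wide panels with
+// row offset offsetB and panel stride sdb; the initial k iterations both walk the triangle
+// and realign B to a panel boundary, after which the body is a plain 4x4 gemm loop. Only
+// the rows [m0,m1) and columns [n0,n1) are written to D at row offset offsetD.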
+void kernel_dtrmm_nn_rl_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ double *D1;
+
+ int k;
+
+ B += offsetB;
+
+ k = 0;
+
+ if(offsetB==0)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 3
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+ else if(offsetB==1)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+ else if(offsetB==2)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 3
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 4
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 5
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+	else // if(offsetB==3)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 3
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 4
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+
+ for(; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[1];
+ b_1 = B[5];
+ b_2 = B[9];
+ b_3 = B[13];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[2];
+ b_1 = B[6];
+ b_2 = B[10];
+ b_3 = B[14];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[3];
+ b_1 = B[7];
+ b_2 = B[11];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+
+ }
+
+ store:
+
+ c_00 = alpha[0]*c_00;
+ c_10 = alpha[0]*c_10;
+ c_20 = alpha[0]*c_20;
+ c_30 = alpha[0]*c_30;
+
+ c_01 = alpha[0]*c_01;
+ c_11 = alpha[0]*c_11;
+ c_21 = alpha[0]*c_21;
+ c_31 = alpha[0]*c_31;
+
+ c_02 = alpha[0]*c_02;
+ c_12 = alpha[0]*c_12;
+ c_22 = alpha[0]*c_22;
+ c_32 = alpha[0]*c_32;
+
+ c_03 = alpha[0]*c_03;
+ c_13 = alpha[0]*c_13;
+ c_23 = alpha[0]*c_23;
+ c_33 = alpha[0]*c_33;
+
+ // shift sol for cols
+ if(n0>0)
+ {
+ if(n0==1)
+ {
+ c_00 = c_01;
+ c_10 = c_11;
+ c_20 = c_21;
+ c_30 = c_31;
+
+ c_01 = c_02;
+ c_11 = c_12;
+ c_21 = c_22;
+ c_31 = c_32;
+
+ c_02 = c_03;
+ c_12 = c_13;
+ c_22 = c_23;
+ c_32 = c_33;
+
+ D0 += 1*bs;
+ }
+ else if(n0==2)
+ {
+ c_00 = c_02;
+ c_10 = c_12;
+ c_20 = c_22;
+ c_30 = c_32;
+
+ c_01 = c_03;
+ c_11 = c_13;
+ c_21 = c_23;
+ c_31 = c_33;
+
+ D0 += 2*bs;
+ }
+ else //if(n0==3)
+ {
+ c_00 = c_03;
+ c_10 = c_13;
+ c_20 = c_23;
+ c_30 = c_33;
+
+ D0 += 3*bs;
+ }
+ }
+
+ int kn = n1 - n0;
+
+ if(offsetD==0)
+ {
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+ if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+ if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+ if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+ if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+ }
+ else if(offsetD==1)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+ }
+ else if(offsetD==2)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+ }
+ else //if(offsetD==3)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+ if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+ if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+ if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+ if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmm_nn_rl_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *D)
+ {
+ kernel_dtrmm_nn_rl_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, 0, D, 0, 0, 4, 0, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
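+// Cholesky factorization of one 4x4 diagonal block: computes the lower factor of
+// C - A*B^T, stores it in D (first km rows, kn columns) and the reciprocals of the
+// diagonal entries in inv_diag_D; a non-positive pivot is replaced by zero.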
+void kernel_dpotrf_nt_l_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, //c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, //c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, //c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+// c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+// c_02 = C[0+bs*2] + c_02;
+// c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+// c_03 = C[0+bs*3] + c_03;
+// c_13 = C[1+bs*3] + c_13;
+// c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
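+	// factorize the 4x4 block: each pivot is square-rooted, its column is scaled by the
+	// reciprocal (saved in inv_diag_D), the trailing entries are updated with the outer
+	// products of the computed columns, and a non-positive pivot falls back to zero.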
+ if(c_00>0)
+ {
+ c_00 = sqrt(c_00);
+ tmp = 1.0/c_00;
+ }
+ else
+ {
+ c_00 = 0.0;
+ tmp = 0.0;
+ }
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+ inv_diag_D[0] = tmp;
+
+ if(kn==1)
+ goto store;
+
+ c_11 -= c_10 * c_10;
+ c_21 -= c_20 * c_10;
+ c_31 -= c_30 * c_10;
+ if(c_11>0)
+ {
+ c_11 = sqrt(c_11);
+ tmp = 1.0/c_11;
+ }
+ else
+ {
+ c_11 = 0.0;
+ tmp = 0.0;
+ }
+ c_21 *= tmp;
+ c_31 *= tmp;
+ inv_diag_D[1] = tmp;
+
+ if(kn==2)
+ goto store;
+
+ c_22 -= c_20 * c_20;
+ c_32 -= c_30 * c_20;
+ c_22 -= c_21 * c_21;
+ c_32 -= c_31 * c_21;
+ if(c_22>0)
+ {
+ c_22 = sqrt(c_22);
+ tmp = 1.0/c_22;
+ }
+ else
+ {
+ c_22 = 0.0;
+ tmp = 0.0;
+ }
+ c_32 *= tmp;
+ inv_diag_D[2] = tmp;
+
+ if(kn==3)
+ goto store;
+
+ c_33 -= c_30 * c_30;
+ c_33 -= c_31 * c_31;
+ c_33 -= c_32 * c_32;
+ if(c_33>0)
+ {
+ c_33 = sqrt(c_33);
+ tmp = 1.0/c_33;
+ }
+ else
+ {
+ c_33 = 0.0;
+ tmp = 0.0;
+ }
+ inv_diag_D[3] = tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+// if(kn==1)
+// return;
+
+// D[0+bs*1] = c_01;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dpotrf_nt_l_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D)
+ {
+ kernel_dpotrf_nt_l_4x4_vs_lib4(kmax, A, B, C, D, inv_diag_D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
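+// Fused syrk + potrf: first D = C + Ap*Bp^T (lower triangle), then the Cholesky kernel
+// subtracts Am*Bm^T from D and factorizes it in place, returning the inverse diagonal.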
+void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn)
+ {
+ double alpha = 1.0;
+ double beta = 1.0;
+ kernel_dsyrk_nt_l_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
+ kernel_dpotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D)
+ {
+ double alpha = 1.0;
+ double beta = 1.0;
+ kernel_dsyrk_nt_l_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
+ kernel_dpotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
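+// Triangular solve D * E^T = C - A*B^T with E lower triangular ("rl"): forward substitution
+// column by column, multiplying by the precomputed reciprocals in inv_diag_E instead of
+// dividing; km/kn select the rows/columns of D actually stored.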
+void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+ c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+ c_02 = C[0+bs*2] + c_02;
+ c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+ c_03 = C[0+bs*3] + c_03;
+ c_13 = C[1+bs*3] + c_13;
+ c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+ tmp = inv_diag_E[0];
+ c_00 *= tmp;
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+
+ if(kn==1)
+ goto store;
+
+ tmp = E[1+bs*0];
+ c_01 -= c_00 * tmp;
+ c_11 -= c_10 * tmp;
+ c_21 -= c_20 * tmp;
+ c_31 -= c_30 * tmp;
+ tmp = inv_diag_E[1];
+ c_01 *= tmp;
+ c_11 *= tmp;
+ c_21 *= tmp;
+ c_31 *= tmp;
+
+ if(kn==2)
+ goto store;
+
+ tmp = E[2+bs*0];
+ c_02 -= c_00 * tmp;
+ c_12 -= c_10 * tmp;
+ c_22 -= c_20 * tmp;
+ c_32 -= c_30 * tmp;
+ tmp = E[2+bs*1];
+ c_02 -= c_01 * tmp;
+ c_12 -= c_11 * tmp;
+ c_22 -= c_21 * tmp;
+ c_32 -= c_31 * tmp;
+ tmp = inv_diag_E[2];
+ c_02 *= tmp;
+ c_12 *= tmp;
+ c_22 *= tmp;
+ c_32 *= tmp;
+
+ if(kn==3)
+ goto store;
+
+ tmp = E[3+bs*0];
+ c_03 -= c_00 * tmp;
+ c_13 -= c_10 * tmp;
+ c_23 -= c_20 * tmp;
+ c_33 -= c_30 * tmp;
+ tmp = E[3+bs*1];
+ c_03 -= c_01 * tmp;
+ c_13 -= c_11 * tmp;
+ c_23 -= c_21 * tmp;
+ c_33 -= c_31 * tmp;
+ tmp = E[3+bs*2];
+ c_03 -= c_02 * tmp;
+ c_13 -= c_12 * tmp;
+ c_23 -= c_22 * tmp;
+ c_33 -= c_32 * tmp;
+ tmp = inv_diag_E[3];
+ c_03 *= tmp;
+ c_13 *= tmp;
+ c_23 *= tmp;
+ c_33 *= tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
+ {
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
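+// Fused gemm + trsm: first D = C + Ap*Bp^T, then the trsm kernel subtracts Am*Bm^T and
+// solves against the lower-triangular factor E (with its inverse diagonal) in place.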
+void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+ {
+ double alpha = 1.0;
+ double beta = 1.0;
+ kernel_dgemm_nt_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
+ kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, D, D, E, inv_diag_E, km, kn);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E)
+ {
+ double alpha = 1.0;
+ double beta = 1.0;
+ kernel_dgemm_nt_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
+ kernel_dtrsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, D, D, E, inv_diag_E);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
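+// Same right-lower triangular solve as the _inv_ kernel, but E has a unit diagonal, so the
+// substitution needs no scaling by inverse diagonal entries.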
+void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+ c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+ c_02 = C[0+bs*2] + c_02;
+ c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+ c_03 = C[0+bs*3] + c_03;
+ c_13 = C[1+bs*3] + c_13;
+ c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+ if(kn==1)
+ goto store;
+
+ tmp = E[1+bs*0];
+ c_01 -= c_00 * tmp;
+ c_11 -= c_10 * tmp;
+ c_21 -= c_20 * tmp;
+ c_31 -= c_30 * tmp;
+
+ if(kn==2)
+ goto store;
+
+ tmp = E[2+bs*0];
+ c_02 -= c_00 * tmp;
+ c_12 -= c_10 * tmp;
+ c_22 -= c_20 * tmp;
+ c_32 -= c_30 * tmp;
+ tmp = E[2+bs*1];
+ c_02 -= c_01 * tmp;
+ c_12 -= c_11 * tmp;
+ c_22 -= c_21 * tmp;
+ c_32 -= c_31 * tmp;
+
+ if(kn==3)
+ goto store;
+
+ tmp = E[3+bs*0];
+ c_03 -= c_00 * tmp;
+ c_13 -= c_10 * tmp;
+ c_23 -= c_20 * tmp;
+ c_33 -= c_30 * tmp;
+ tmp = E[3+bs*1];
+ c_03 -= c_01 * tmp;
+ c_13 -= c_11 * tmp;
+ c_23 -= c_21 * tmp;
+ c_33 -= c_31 * tmp;
+ tmp = E[3+bs*2];
+ c_03 -= c_02 * tmp;
+ c_13 -= c_12 * tmp;
+ c_23 -= c_22 * tmp;
+ c_33 -= c_32 * tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_rl_one_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E)
+ {
+ kernel_dtrsm_nt_rl_one_4x4_vs_lib4(k, A, B, C, D, E, 4, 4);
+ }
+#endif
+
+
+
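+// D = C - A * B^T, then backward substitution over the columns of D with the upper-triangular factor E: column j is scaled by inv_diag_E[j] and eliminated from the columns i<j via E[i+bs*j]; km and kn give the rows and columns actually stored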
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ double
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+		// one k iteration of the clean-up loop
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+ c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+ c_02 = C[0+bs*2] + c_02;
+ c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+ c_03 = C[0+bs*3] + c_03;
+ c_13 = C[1+bs*3] + c_13;
+ c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+
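+	// solve: columns 3..0 in turn (those beyond kn are skipped), each scaled by its reciprocal diagonal entry and then eliminated from the lower-indexed columns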
+ if(kn>3)
+ {
+ tmp = inv_diag_E[3];
+ c_03 *= tmp;
+ c_13 *= tmp;
+ c_23 *= tmp;
+ c_33 *= tmp;
+ tmp = E[2+bs*3];
+ c_02 -= c_03 * tmp;
+ c_12 -= c_13 * tmp;
+ c_22 -= c_23 * tmp;
+ c_32 -= c_33 * tmp;
+ tmp = E[1+bs*3];
+ c_01 -= c_03 * tmp;
+ c_11 -= c_13 * tmp;
+ c_21 -= c_23 * tmp;
+ c_31 -= c_33 * tmp;
+ tmp = E[0+bs*3];
+ c_00 -= c_03 * tmp;
+ c_10 -= c_13 * tmp;
+ c_20 -= c_23 * tmp;
+ c_30 -= c_33 * tmp;
+ }
+
+ if(kn>2)
+ {
+ tmp = inv_diag_E[2];
+ c_02 *= tmp;
+ c_12 *= tmp;
+ c_22 *= tmp;
+ c_32 *= tmp;
+ tmp = E[1+bs*2];
+ c_01 -= c_02 * tmp;
+ c_11 -= c_12 * tmp;
+ c_21 -= c_22 * tmp;
+ c_31 -= c_32 * tmp;
+ tmp = E[0+bs*2];
+ c_00 -= c_02 * tmp;
+ c_10 -= c_12 * tmp;
+ c_20 -= c_22 * tmp;
+ c_30 -= c_32 * tmp;
+ }
+
+ if(kn>1)
+ {
+ tmp = inv_diag_E[1];
+ c_01 *= tmp;
+ c_11 *= tmp;
+ c_21 *= tmp;
+ c_31 *= tmp;
+ tmp = E[0+bs*1];
+ c_00 -= c_01 * tmp;
+ c_10 -= c_11 * tmp;
+ c_20 -= c_21 * tmp;
+ c_30 -= c_31 * tmp;
+ }
+
+ tmp = inv_diag_E[0];
+ c_00 *= tmp;
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nt_ru_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E)
+ {
+ kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
+
+
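+// D = C - A * B (B with panel stride sdb), then unblocked LU factorization of the 4x4 result without pivoting; the reciprocals of the pivots are returned in inv_diag_D, and km and kn give the rows and columns actually stored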
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgetrf_nn_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+ // factorization
+
+ // first column
+ tmp = 1.0 / c_00;
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+
+ inv_diag_D[0] = tmp;
+
+ if(kn==1)
+ goto store;
+
+ // second column
+ c_11 -= c_10 * c_01;
+ c_21 -= c_20 * c_01;
+ c_31 -= c_30 * c_01;
+
+ tmp = 1.0 / c_11;
+ c_21 *= tmp;
+ c_31 *= tmp;
+
+ inv_diag_D[1] = tmp;
+
+ if(kn==2)
+ goto store;
+
+ // third column
+ c_12 -= c_10 * c_02;
+ c_22 -= c_20 * c_02;
+ c_32 -= c_30 * c_02;
+
+ c_22 -= c_21 * c_12;
+ c_32 -= c_31 * c_12;
+
+ tmp = 1.0 / c_22;
+ c_32 *= tmp;
+
+ inv_diag_D[2] = tmp;
+
+ if(kn==3)
+ goto store;
+
+ // fourth column
+ c_13 -= c_10 * c_03;
+ c_23 -= c_20 * c_03;
+ c_33 -= c_30 * c_03;
+
+ c_23 -= c_21 * c_13;
+ c_33 -= c_31 * c_13;
+
+ c_33 -= c_32 * c_23;
+
+ tmp = 1.0 / c_33;
+
+ inv_diag_D[3] = tmp;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgetrf_nn_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D)
+ {
+ kernel_dgetrf_nn_4x4_vs_lib4(kmax, A, B, sdb, C, D, inv_diag_D, 4, 4);
+ }
+#endif
+
+
+
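+// D = C - A * B, then a left solve with the unit-diagonal lower-triangular factor E: forward substitution over rows, row i reduced by the rows k<i via E[i+bs*k]; km and kn give the rows and columns actually stored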
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ e_1, e_2, e_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+	// solve: forward substitution over rows with the unit-diagonal lower-triangular factor E
+
+ if(km==1)
+ goto store;
+
+ e_1 = E[1+bs*0];
+ e_2 = E[2+bs*0];
+ e_3 = E[3+bs*0];
+ c_10 -= e_1 * c_00;
+ c_20 -= e_2 * c_00;
+ c_30 -= e_3 * c_00;
+ c_11 -= e_1 * c_01;
+ c_21 -= e_2 * c_01;
+ c_31 -= e_3 * c_01;
+ c_12 -= e_1 * c_02;
+ c_22 -= e_2 * c_02;
+ c_32 -= e_3 * c_02;
+ c_13 -= e_1 * c_03;
+ c_23 -= e_2 * c_03;
+ c_33 -= e_3 * c_03;
+
+ if(km==2)
+ goto store;
+
+ e_2 = E[2+bs*1];
+ e_3 = E[3+bs*1];
+ c_20 -= e_2 * c_10;
+ c_30 -= e_3 * c_10;
+ c_21 -= e_2 * c_11;
+ c_31 -= e_3 * c_11;
+ c_22 -= e_2 * c_12;
+ c_32 -= e_3 * c_12;
+ c_23 -= e_2 * c_13;
+ c_33 -= e_3 * c_13;
+
+ if(km==3)
+ goto store;
+
+ e_3 = E[3+bs*2];
+ c_30 -= e_3 * c_20;
+ c_31 -= e_3 * c_21;
+ c_32 -= e_3 * c_22;
+ c_33 -= e_3 * c_23;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ll_one_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E)
+ {
+ kernel_dtrsm_nn_ll_one_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, 4, 4);
+ }
+#endif
+
+
+
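+// D = C - A * B, then a right solve with the upper-triangular factor E: column j is reduced by the already solved columns i<j (entries E[i+bs*j]) and scaled by inv_diag_E[j]; km and kn give the rows and columns actually stored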
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ e_00, e_01, e_02, e_03,
+ e_11, e_12, e_13,
+ e_22, e_23,
+ e_33,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+ // solve
+
+ e_00 = inv_diag_E[0];
+ c_00 *= e_00;
+ c_10 *= e_00;
+ c_20 *= e_00;
+ c_30 *= e_00;
+
+ if(kn==1)
+ goto store;
+
+ e_01 = E[0+bs*1];
+ e_11 = inv_diag_E[1];
+ c_01 -= c_00 * e_01;
+ c_11 -= c_10 * e_01;
+ c_21 -= c_20 * e_01;
+ c_31 -= c_30 * e_01;
+ c_01 *= e_11;
+ c_11 *= e_11;
+ c_21 *= e_11;
+ c_31 *= e_11;
+
+ if(kn==2)
+ goto store;
+
+ e_02 = E[0+bs*2];
+ e_12 = E[1+bs*2];
+ e_22 = inv_diag_E[2];
+ c_02 -= c_00 * e_02;
+ c_12 -= c_10 * e_02;
+ c_22 -= c_20 * e_02;
+ c_32 -= c_30 * e_02;
+ c_02 -= c_01 * e_12;
+ c_12 -= c_11 * e_12;
+ c_22 -= c_21 * e_12;
+ c_32 -= c_31 * e_12;
+ c_02 *= e_22;
+ c_12 *= e_22;
+ c_22 *= e_22;
+ c_32 *= e_22;
+
+ if(kn==3)
+ goto store;
+
+ e_03 = E[0+bs*3];
+ e_13 = E[1+bs*3];
+ e_23 = E[2+bs*3];
+ e_33 = inv_diag_E[3];
+ c_03 -= c_00 * e_03;
+ c_13 -= c_10 * e_03;
+ c_23 -= c_20 * e_03;
+ c_33 -= c_30 * e_03;
+ c_03 -= c_01 * e_13;
+ c_13 -= c_11 * e_13;
+ c_23 -= c_21 * e_13;
+ c_33 -= c_31 * e_13;
+ c_03 -= c_02 * e_23;
+ c_13 -= c_12 * e_23;
+ c_23 -= c_22 * e_23;
+ c_33 -= c_32 * e_23;
+ c_03 *= e_33;
+ c_13 *= e_33;
+ c_23 *= e_33;
+ c_33 *= e_33;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_ru_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
+ {
+ kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
+
+
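+// D = C - A * B, then a left solve with the upper-triangular factor E: back substitution over rows 3..0, each row scaled by inv_diag_E[j] and eliminated from the rows above via E[i+bs*j]; km and kn give the rows and columns actually stored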
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ e_00, e_01, e_02, e_03,
+ e_11, e_12, e_13,
+ e_22, e_23,
+ e_33,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+ // solve
+
+ if(km>3)
+ {
+ e_03 = E[0+bs*3];
+ e_13 = E[1+bs*3];
+ e_23 = E[2+bs*3];
+ e_33 = inv_diag_E[3];
+ c_30 *= e_33;
+ c_31 *= e_33;
+ c_32 *= e_33;
+ c_33 *= e_33;
+ c_00 -= e_03 * c_30;
+ c_01 -= e_03 * c_31;
+ c_02 -= e_03 * c_32;
+ c_03 -= e_03 * c_33;
+ c_10 -= e_13 * c_30;
+ c_11 -= e_13 * c_31;
+ c_12 -= e_13 * c_32;
+ c_13 -= e_13 * c_33;
+ c_20 -= e_23 * c_30;
+ c_21 -= e_23 * c_31;
+ c_22 -= e_23 * c_32;
+ c_23 -= e_23 * c_33;
+ }
+
+ if(km>2)
+ {
+ e_02 = E[0+bs*2];
+ e_12 = E[1+bs*2];
+ e_22 = inv_diag_E[2];
+ c_20 *= e_22;
+ c_21 *= e_22;
+ c_22 *= e_22;
+ c_23 *= e_22;
+ c_00 -= e_02 * c_20;
+ c_01 -= e_02 * c_21;
+ c_02 -= e_02 * c_22;
+ c_03 -= e_02 * c_23;
+ c_10 -= e_12 * c_20;
+ c_11 -= e_12 * c_21;
+ c_12 -= e_12 * c_22;
+ c_13 -= e_12 * c_23;
+ }
+
+ if(km>1)
+ {
+ e_01 = E[0+bs*1];
+ e_11 = inv_diag_E[1];
+ c_10 *= e_11;
+ c_11 *= e_11;
+ c_12 *= e_11;
+ c_13 *= e_11;
+ c_00 -= e_01 * c_10;
+ c_01 -= e_01 * c_11;
+ c_02 -= e_01 * c_12;
+ c_03 -= e_01 * c_13;
+ }
+
+ e_00 = inv_diag_E[0];
+ c_00 *= e_00;
+ c_01 *= e_00;
+ c_02 *= e_00;
+ c_03 *= e_00;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsm_nn_lu_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
+ {
+ kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
diff --git a/kernel/c99/kernel_dgemm_diag_lib4.c b/kernel/c99/kernel_dgemm_diag_lib4.c
new file mode 100644
index 0000000..cad2b21
--- /dev/null
+++ b/kernel/c99/kernel_dgemm_diag_lib4.c
@@ -0,0 +1,1111 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// B holds the diagonal of the right factor: D = alpha * A * diag(B), case beta=0.0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_4_a0_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+ b_2 = alpha0 * B[2];
+ b_3 = alpha0 * B[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_0;
+ c_2 = a_2 * b_0;
+ c_3 = a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = a_0 * b_1;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_1;
+ c_3 = a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ c_0 = a_0 * b_2;
+ c_1 = a_1 * b_2;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ c_0 = a_0 * b_3;
+ c_1 = a_1 * b_3;
+ c_2 = a_2 * b_3;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ A += 4*sda;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ a_0 = A[0+bs*2];
+
+ c_0 = a_0 * b_2;
+
+ D[0+bs*2] = c_0;
+
+
+ a_0 = A[0+bs*3];
+
+ c_0 = a_0 * b_3;
+
+ D[0+bs*3] = c_0;
+
+
+ A += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B holds the diagonal of the right factor: D = beta * C + alpha * A * diag(B), 4 columns
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_4_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+ b_2 = alpha0 * B[2];
+ b_3 = alpha0 * B[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_3;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_3;
+ c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ a_0 = A[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+ D[0+bs*2] = c_0;
+
+
+ a_0 = A[0+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+
+ D[0+bs*3] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B holds the diagonal of the right factor: D = beta * C + alpha * A * diag(B), 3 columns
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_3_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+ b_2 = alpha0 * B[2];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ a_0 = A[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+ D[0+bs*2] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B holds the diagonal of the right factor: D = beta * C + alpha * A * diag(B), 2 columns
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_2_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B holds the diagonal of the right factor: D = beta * C + alpha * A * diag(B), 1 column
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_right_1_lib4(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// A holds the diagonal of the left factor: D = alpha * diag(A) * B, case beta=0.0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_4_a0_lib4(int kmax, double *alpha, double *A, double *B, double *D, int alg)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+ a_3 = alpha0 * A[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+ b_3 = B[3+bs*1];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+ b_3 = B[3+bs*2];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+ b_3 = B[3+bs*3];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ B += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+ B += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A holds the diagonal of the left factor: D = beta * C + alpha * diag(A) * B, 4 rows
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int alg)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+ a_3 = alpha0 * A[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+ b_3 = B[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_3;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+ b_3 = B[3+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*2] + a_3 * b_3;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+ b_3 = B[3+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A holds the diagonal of the left factor: D = beta * C + alpha * diag(A) * B, 3 rows
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_3_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1, a_2,
+ b_0, b_1, b_2,
+ c_0, c_1, c_2;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A holds the diagonal of the left factor: D = beta * C + alpha * diag(A) * B, 2 rows
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_2_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0, a_1,
+ b_0, b_1,
+ c_0, c_1;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A holds the diagonal of the left factor: D = beta * C + alpha * diag(A) * B, 1 row
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemm_diag_left_1_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ alpha0, beta0,
+ a_0,
+ b_0,
+ c_0;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ b_0 = B[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+ D[0+bs*1] = c_0;
+
+
+ b_0 = B[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+ D[0+bs*2] = c_0;
+
+
+ b_0 = B[0+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+ D[0+bs*3] = c_0;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
diff --git a/kernel/c99/kernel_dgemv_4_lib4.c b/kernel/c99/kernel_dgemv_4_lib4.c
new file mode 100644
index 0000000..9f11b5f
--- /dev/null
+++ b/kernel/c99/kernel_dgemv_4_lib4.c
@@ -0,0 +1,1009 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
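+// z = beta * y + alpha * A * x for one 4-row panel of A; only the entries with index in [k0, k1) are written to z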
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_n_4_gen_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k0, int k1)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ x_0,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ x_0 = x[1];
+
+ y_0 += A[0+bs*1] * x_0;
+ y_1 += A[1+bs*1] * x_0;
+ y_2 += A[2+bs*1] * x_0;
+ y_3 += A[3+bs*1] * x_0;
+
+ x_0 = x[2];
+
+ y_0 += A[0+bs*2] * x_0;
+ y_1 += A[1+bs*2] * x_0;
+ y_2 += A[2+bs*2] * x_0;
+ y_3 += A[3+bs*2] * x_0;
+
+ x_0 = x[3];
+
+ y_0 += A[0+bs*3] * x_0;
+ y_1 += A[1+bs*3] * x_0;
+ y_2 += A[2+bs*3] * x_0;
+ y_3 += A[3+bs*3] * x_0;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ A += 1*bs;
+ x += 1;
+
+ }
+
+ y_0 = alpha[0]*y_0 + beta[0]*y[0];
+ y_1 = alpha[0]*y_1 + beta[0]*y[1];
+ y_2 = alpha[0]*y_2 + beta[0]*y[2];
+ y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+ if(k0<=0 & k1>3)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ if(k0<=0 & k1>0) z[0] = y_0;
+ if(k0<=1 & k1>1) z[1] = y_1;
+ if(k0<=2 & k1>2) z[2] = y_2;
+ if(k0<=3 & k1>3) z[3] = y_3;
+ }
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_n_4_vs_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k1)
+ {
+
+ kernel_dgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, k1);
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_n_4_lib4(int kmax, double *alpha, double *A, double *x, double *beta, double *y, double *z)
+ {
+
+ kernel_dgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, 4);
+
+ }
+#endif
+
+
+
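+// z = beta * y + alpha * A^T * x over 4 columns of A (panel stride sda, first row at offset offA); only the first km entries of z (at most 4) are written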
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_t_4_gen_lib4(int kmax, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *y, double *z, int km)
+ {
+
+ const int bs = 4;
+
+ int k, kend;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ if(offA!=0) // 1, 2, 3
+ {
+ kend = 4-offA<kmax ? 4-offA : kmax;
+ for(; k<kend; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ A += 1;
+ x += 1;
+
+ }
+ A += bs*(sda-1);
+ }
+ for(; k<kmax-bs+1; k+=bs)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ y_0 += A[1+bs*0] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+ y_0 += A[2+bs*0] * x_2;
+ y_1 += A[2+bs*1] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+ y_0 += A[3+bs*0] * x_3;
+ y_1 += A[3+bs*1] * x_3;
+ y_2 += A[3+bs*2] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ A += 1;
+ x += 1;
+
+ }
+
+ y_0 = alpha[0]*y_0 + beta[0]*y[0];
+ y_1 = alpha[0]*y_1 + beta[0]*y[1];
+ y_2 = alpha[0]*y_2 + beta[0]*y[2];
+ y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+ if(km>=4)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ z[0] = y_0;
+ if(km>=2)
+ {
+ z[1] = y_1;
+ if(km>2)
+ {
+ z[2] = y_2;
+ }
+ }
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_t_4_lib4(int kmax, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z)
+ {
+
+ kernel_dgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, 4);
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dgemv_t_4_vs_lib4(int kmax, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z, int k1)
+ {
+
+ kernel_dgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, k1);
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
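+// forward substitution on a 4x4 lower-triangular block whose reciprocal diagonal is passed in inv_diag_A, after subtracting A*x (kmax columns) from y; km/kn handle the variable-size case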
+void kernel_dtrsv_ln_inv_4_vs_lib4(int kmax, double *A, double *inv_diag_A, double *x, double *y, double *z, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[1+bs*0] * x_0;
+ y_2 -= A[2+bs*0] * x_0;
+ y_3 -= A[3+bs*0] * x_0;
+
+ y_0 -= A[0+bs*1] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[2+bs*1] * x_1;
+ y_3 -= A[3+bs*1] * x_1;
+
+ y_0 -= A[0+bs*2] * x_2;
+ y_1 -= A[1+bs*2] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+ y_3 -= A[3+bs*2] * x_2;
+
+ y_0 -= A[0+bs*3] * x_3;
+ y_1 -= A[1+bs*3] * x_3;
+ y_2 -= A[2+bs*3] * x_3;
+ y_3 -= A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+ y_3 = y[3] + y_3;
+
+ double
+ a_00, a_10, a_20, a_30,
+ a_11, a_21, a_31;
+
+ // a_00
+ a_00 = inv_diag_A[0];
+ a_10 = A[1+bs*0];
+ a_20 = A[2+bs*0];
+ a_30 = A[3+bs*0];
+ y_0 *= a_00;
+ z[0] = y_0;
+ y_1 -= a_10 * y_0;
+ y_2 -= a_20 * y_0;
+ y_3 -= a_30 * y_0;
+
+ if(kn==1)
+ {
+ if(km==1)
+ return;
+ y[1] = y_1;
+ if(km==2)
+ return;
+ y[2] = y_2;
+ if(km==3)
+ return;
+ y[3] = y_3;
+ return;
+ }
+
+ // a_11
+ a_11 = inv_diag_A[1];
+ a_21 = A[2+bs*1];
+ a_31 = A[3+bs*1];
+ y_1 *= a_11;
+ z[1] = y_1;
+ y_2 -= a_21 * y_1;
+ y_3 -= a_31 * y_1;
+
+ if(kn==2)
+ {
+ if(km==2)
+ return;
+ y[2] = y_2;
+ if(km==3)
+ return;
+ y[3] = y_3;
+ return;
+ }
+
+ // a_22
+ a_00 = inv_diag_A[2];
+ a_10 = A[3+bs*2];
+ y_2 *= a_00;
+ z[2] = y_2;
+ y_3 -= a_10 * y_2;
+
+ if(kn==3)
+ {
+ if(km==3)
+ return;
+ y[3] = y_3;
+
+ return;
+ }
+
+ // a_33
+ a_11 = inv_diag_A[3];
+ y_3 *= a_11;
+ z[3] = y_3;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrsv_ln_inv_4_lib4(int kmax, double *A, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ kernel_dtrsv_ln_inv_4_vs_lib4(kmax, A, inv_diag_A, x, y, z, 4, 4);
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
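+// backward substitution with the transpose of the 4x4 lower-triangular block at the top of A: z = L^{-T} * (y - A[4:kmax,0:4]' * x[4:kmax]); inv_diag_A holds the reciprocal diagonal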
+void kernel_dtrsv_lt_inv_4_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ double *tA, *tx;
+ tA = A;
+ tx = x;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+ y_3 -= A[0+bs*3] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[1+bs*2] * x_1;
+ y_3 -= A[1+bs*3] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+ y_3 -= A[2+bs*3] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+ y_3 -= A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+ y_3 -= A[0+bs*3] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+ y_3 = y[3] + y_3;
+
+ A = tA;
+ x = tx;
+
+	// bottom triangle
+ y_3 *= inv_diag_A[3];
+ z[3] = y_3;
+
+ y_2 -= A[3+bs*2] * y_3;
+ y_2 *= inv_diag_A[2];
+ z[2] = y_2;
+
+ // square
+ y_0 -= A[2+bs*0]*y_2 + A[3+bs*0]*y_3;
+ y_1 -= A[2+bs*1]*y_2 + A[3+bs*1]*y_3;
+
+	// top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
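+// 3-unknown variant of the transposed lower-triangular solve above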
+void kernel_dtrsv_lt_inv_3_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ double *tA, *tx;
+ tA = A;
+ tx = x;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0;
+
+ k = 3;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_3 = x[3];
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[1+bs*2] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 3;
+ x += 1;
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+
+ A = tA;
+ x = tx;
+
+	// bottom triangle
+ y_2 *= inv_diag_A[2];
+ z[2] = y_2;
+
+ // square
+ y_0 -= A[2+bs*0]*y_2;
+ y_1 -= A[2+bs*1]*y_2;
+
+	// top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
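+// 2-unknown variant of the transposed lower-triangular solve above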
+void kernel_dtrsv_lt_inv_2_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ double *tA, *tx;
+ tA = A;
+ tx = x;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0;
+
+ k = 2;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 2;
+ x += 2;
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+
+ A = tA;
+ x = tx;
+
+	// top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
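+// 1-unknown variant of the transposed lower-triangular solve above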
+void kernel_dtrsv_lt_inv_1_lib4(int kmax, double *A, int sda, double *inv_diag_A, double *x, double *y, double *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ double *tA, *tx;
+ tA = A;
+ tx = x;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0;
+
+ k = 1;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_0 -= A[2+bs*0] * x_2;
+ y_0 -= A[3+bs*0] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_0 -= A[1+bs*0] * x_1;
+ y_0 -= A[2+bs*0] * x_2;
+ y_0 -= A[3+bs*0] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 1;
+ x += 1;
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+
+ A = tA;
+ x = tx;
+
+	// top triangle
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
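+// z[0:3] = A * x, where the leading 4x4 block of A is upper triangular (its strictly lower part is skipped) and the remaining kmax-4 columns are dense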
+void kernel_dtrmv_un_4_lib4(int kmax, double *A, double *x, double *z)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+/* y_1 += A[1+bs*0] * x_0;*/
+/* y_2 += A[2+bs*0] * x_0;*/
+/* y_3 += A[3+bs*0] * x_0;*/
+
+ y_0 += A[0+bs*1] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+/* y_2 += A[2+bs*1] * x_1;*/
+/* y_3 += A[3+bs*1] * x_1;*/
+
+ y_0 += A[0+bs*2] * x_2;
+ y_1 += A[1+bs*2] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+/* y_3 += A[3+bs*2] * x_2;*/
+
+ y_0 += A[0+bs*3] * x_3;
+ y_1 += A[1+bs*3] * x_3;
+ y_2 += A[2+bs*3] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ k=4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ y_0 += A[0+bs*1] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[2+bs*1] * x_1;
+ y_3 += A[3+bs*1] * x_1;
+
+ y_0 += A[0+bs*2] * x_2;
+ y_1 += A[1+bs*2] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[3+bs*2] * x_2;
+
+ y_0 += A[0+bs*3] * x_3;
+ y_1 += A[1+bs*3] * x_3;
+ y_2 += A[2+bs*3] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ A += 1*bs;
+ x += 1;
+
+ }
+
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
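+// z[0:3] = A' * x, with A a kmax x 4 panel-major block (panel stride sda); only the lower triangle of the trailing 4x4 block contributes, i.e. the transposed upper-triangular product; km masks the stores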
+void kernel_dtrmv_ut_4_vs_lib4(int kmax, double *A, int sda, double *x, double *z, int km)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ double
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ for(; k<kmax-4; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ y_0 += A[1+bs*0] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+ y_0 += A[2+bs*0] * x_2;
+ y_1 += A[2+bs*1] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+ y_0 += A[3+bs*0] * x_3;
+ y_1 += A[3+bs*1] * x_3;
+ y_2 += A[3+bs*2] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+/* y_0 += A[1+bs*0] * x_1;*/
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+/* y_0 += A[2+bs*0] * x_2;*/
+/* y_1 += A[2+bs*1] * x_2;*/
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+/* y_0 += A[3+bs*0] * x_3;*/
+/* y_1 += A[3+bs*1] * x_3;*/
+/* y_2 += A[3+bs*2] * x_3;*/
+ y_3 += A[3+bs*3] * x_3;
+
+// A += sda*bs;
+// x += 4;
+
+	// store_vs
+ if(km>=4)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ z[0] = y_0;
+ if(km>=2)
+ {
+ z[1] = y_1;
+ if(km>2)
+ {
+ z[2] = y_2;
+ }
+ }
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_dtrmv_ut_4_lib4(int kmax, double *A, int sda, double *x, double *z)
+ {
+
+ kernel_dtrmv_ut_4_vs_lib4(kmax, A, sda, x, z, 4);
+
+ }
+#endif
+
+
+
+
+
diff --git a/kernel/c99/kernel_dgeqrf_4_lib4.c b/kernel/c99/kernel_dgeqrf_4_lib4.c
new file mode 100644
index 0000000..071ec86
--- /dev/null
+++ b/kernel/c99/kernel_dgeqrf_4_lib4.c
@@ -0,0 +1,2620 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
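+// Householder QR factorization of an m x 4 panel-major block pD (panel stride sdd): on exit the upper triangle holds R, the strictly lower part the reflector vectors, and dD[0:3] the tau factors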
+void kernel_dgeqrf_4_lib4(int m, double *pD, int sdd, double *dD)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w1, w2, w3;
+ const int ps = 4;
+ // first column
+ beta = 0.0;
+ ii = 1;
+ if(m>1)
+ {
+ tmp = pD[1+ps*0];
+ beta += tmp*tmp;
+ if(m>2)
+ {
+ tmp = pD[2+ps*0];
+ beta += tmp*tmp;
+ if(m>3)
+ {
+ tmp = pD[3+ps*0];
+ beta += tmp*tmp;
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*0];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[0] = 0.0;
+ }
+ else
+ {
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau0
+ dD[0] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pD[0+ps*0] = beta;
+ ii = 1;
+ if(m>1)
+ {
+ pD[1+ps*0] *= tmp;
+ if(m>2)
+ {
+ pD[2+ps*0] *= tmp;
+ if(m>3)
+ {
+ pD[3+ps*0] *= tmp;
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*0] *= tmp;
+ pD[1+ii*sdd+ps*0] *= tmp;
+ pD[2+ii*sdd+ps*0] *= tmp;
+ pD[3+ii*sdd+ps*0] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*0] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w1 = pD[0+ps*1];
+ w2 = pD[0+ps*2];
+ w3 = pD[0+ps*3];
+ if(m>1)
+ {
+ w1 += pD[1+ps*1] * pD[1+ps*0];
+ w2 += pD[1+ps*2] * pD[1+ps*0];
+ w3 += pD[1+ps*3] * pD[1+ps*0];
+ if(m>2)
+ {
+ w1 += pD[2+ps*1] * pD[2+ps*0];
+ w2 += pD[2+ps*2] * pD[2+ps*0];
+ w3 += pD[2+ps*3] * pD[2+ps*0];
+ if(m>3)
+ {
+ w1 += pD[3+ps*1] * pD[3+ps*0];
+ w2 += pD[3+ps*2] * pD[3+ps*0];
+ w3 += pD[3+ps*3] * pD[3+ps*0];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w1 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ w1 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ w1 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ w1 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w1 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ }
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ pD[0+ps*1] += w1;
+ pD[0+ps*2] += w2;
+ pD[0+ps*3] += w3;
+ if(m>1)
+ {
+ pD[1+ps*1] += w1 * pD[1+ps*0];
+ pD[1+ps*2] += w2 * pD[1+ps*0];
+ pD[1+ps*3] += w3 * pD[1+ps*0];
+ if(m>2)
+ {
+ pD[2+ps*1] += w1 * pD[2+ps*0];
+ pD[2+ps*2] += w2 * pD[2+ps*0];
+ pD[2+ps*3] += w3 * pD[2+ps*0];
+ if(m>3)
+ {
+ pD[3+ps*1] += w1 * pD[3+ps*0];
+ pD[3+ps*2] += w2 * pD[3+ps*0];
+ pD[3+ps*3] += w3 * pD[3+ps*0];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*1] += w1 * pD[0+ii*sdd+ps*0];
+ pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*0];
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*1] += w1 * pD[1+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*0];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*1] += w1 * pD[2+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*0];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*1] += w1 * pD[3+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*0];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*0];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*1] += w1 * pD[ll+ii*sdd+ps*0];
+ pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*0];
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*0];
+ }
+ if(m==1)
+ return;
+ // second column
+ beta = 0.0;
+ if(m>2)
+ {
+ tmp = pD[2+ps*1];
+ beta += tmp*tmp;
+ if(m>3)
+ {
+ tmp = pD[3+ps*1];
+ beta += tmp*tmp;
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*1];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[1] = 0.0;
+ }
+ else
+ {
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau1
+ dD[1] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+		// compute v1
+ pD[1+ps*1] = beta;
+ if(m>2)
+ {
+ pD[2+ps*1] *= tmp;
+ if(m>3)
+ {
+ pD[3+ps*1] *= tmp;
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*1] *= tmp;
+ pD[1+ii*sdd+ps*1] *= tmp;
+ pD[2+ii*sdd+ps*1] *= tmp;
+ pD[3+ii*sdd+ps*1] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*1] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w2 = pD[1+ps*2];
+ w3 = pD[1+ps*3];
+ if(m>2)
+ {
+ w2 += pD[2+ps*2] * pD[2+ps*1];
+ w3 += pD[2+ps*3] * pD[2+ps*1];
+ if(m>3)
+ {
+ w2 += pD[3+ps*2] * pD[3+ps*1];
+ w3 += pD[3+ps*3] * pD[3+ps*1];
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w2 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ w2 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ w2 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ w2 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w2 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ }
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ pD[1+ps*2] += w2;
+ pD[1+ps*3] += w3;
+ if(m>2)
+ {
+ pD[2+ps*2] += w2 * pD[2+ps*1];
+ pD[2+ps*3] += w3 * pD[2+ps*1];
+ if(m>3)
+ {
+ pD[3+ps*2] += w2 * pD[3+ps*1];
+ pD[3+ps*3] += w3 * pD[3+ps*1];
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*2] += w2 * pD[0+ii*sdd+ps*1];
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*1];
+ pD[1+ii*sdd+ps*2] += w2 * pD[1+ii*sdd+ps*1];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*1];
+ pD[2+ii*sdd+ps*2] += w2 * pD[2+ii*sdd+ps*1];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*1];
+ pD[3+ii*sdd+ps*2] += w2 * pD[3+ii*sdd+ps*1];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*1];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*2] += w2 * pD[ll+ii*sdd+ps*1];
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*1];
+ }
+ if(m==2)
+ return;
+ // third column
+ beta = 0.0;
+ if(m>3)
+ {
+ tmp = pD[3+ps*2];
+ beta += tmp*tmp;
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*2];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[2] = 0.0;
+ }
+ else
+ {
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau2
+ dD[2] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+		// compute v2
+ pD[2+ps*2] = beta;
+ if(m>3)
+ {
+ pD[3+ps*2] *= tmp;
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*2] *= tmp;
+ pD[1+ii*sdd+ps*2] *= tmp;
+ pD[2+ii*sdd+ps*2] *= tmp;
+ pD[3+ii*sdd+ps*2] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*2] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w3 = pD[2+ps*3];
+ if(m>3)
+ {
+ w3 += pD[3+ps*3] * pD[3+ps*2];
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ w3 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ w3 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ w3 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ w3 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ w3 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ w3 = - dD[2] * w3;
+ pD[2+ps*3] += w3;
+ if(m>3)
+ {
+ pD[3+ps*3] += w3 * pD[3+ps*2];
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*3] += w3 * pD[0+ii*sdd+ps*2];
+ pD[1+ii*sdd+ps*3] += w3 * pD[1+ii*sdd+ps*2];
+ pD[2+ii*sdd+ps*3] += w3 * pD[2+ii*sdd+ps*2];
+ pD[3+ii*sdd+ps*3] += w3 * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*3] += w3 * pD[ll+ii*sdd+ps*2];
+ }
+ if(m==3)
+ return;
+ // fourth column
+ beta = 0.0;
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ tmp = pD[0+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[1+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[2+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ tmp = pD[3+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ tmp = pD[ll+ii*sdd+ps*3];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[3] = 0.0;
+ }
+ else
+ {
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau3
+ dD[3] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+		// compute v3
+ pD[3+ps*3] = beta;
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ pD[0+ii*sdd+ps*3] *= tmp;
+ pD[1+ii*sdd+ps*3] *= tmp;
+ pD[2+ii*sdd+ps*3] *= tmp;
+ pD[3+ii*sdd+ps*3] *= tmp;
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ pD[ll+ii*sdd+ps*3] *= tmp;
+ }
+ }
+ return;
+ }
+
+
+// unblocked algorithm
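+// variable-size QR: computes k Householder reflectors of the m x n matrix starting at row offset offD, storing the tau factors in dD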
+void kernel_dgeqrf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+ const int ps = 4;
+ imax = k; //m<n ? m : n;
+ double alpha, beta, tmp, w0;
+ double *pC00, *pC10, *pC01, *pC11;
+ int offset;
+ double *pD0 = pD-offD;
+ for(ii=0; ii<imax; ii++)
+ {
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ jmax = m-ii-1;
+ jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ offset = 0;
+ jj = 0;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ tmp = pC10[1+offset];
+ beta += tmp*tmp;
+ tmp = pC10[2+offset];
+ beta += tmp*tmp;
+ tmp = pC10[3+offset];
+ beta += tmp*tmp;
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ tmp = pC10[0+offset];
+ beta += tmp*tmp;
+ offset += 1;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ offset = 0;
+ jj = 0;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ pC10[0+offset] *= tmp;
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ pC10[0+offset] *= tmp;
+ pC10[1+offset] *= tmp;
+ pC10[2+offset] *= tmp;
+ pC10[3+offset] *= tmp;
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ pC10[0+offset] *= tmp;
+ offset += 1;
+ }
+ pC00[0] = beta;
+ }
+ if(ii<n)
+ {
+ pC01 = pC00 + ps;
+ pC11 = pC10 + ps;
+ kmax = jmax;
+ kmax0 = jmax0;
+ jmax = n-ii-1;
+ jj = 0;
+ for( ; jj<jmax; jj++)
+ {
+ w0 = pC01[0+ps*jj] * 1.0;
+ offset = 0;
+ kk = 0;
+ if(kmax0>0)
+ {
+ for( ; kk<kmax0; kk++)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ offset += 1;
+ }
+ offset += -ps+ps*sdd;
+ }
+ for( ; kk<kmax-3; kk+=4)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ w0 += pC11[1+offset+ps*jj] * pC10[1+offset];
+ w0 += pC11[2+offset+ps*jj] * pC10[2+offset];
+ w0 += pC11[3+offset+ps*jj] * pC10[3+offset];
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<kmax-kk; ll++)
+ {
+ w0 += pC11[0+offset+ps*jj] * pC10[0+offset];
+ offset += 1;
+ }
+ w0 = - dD[ii] * w0;
+ pC01[0+ps*jj] += w0;
+ offset = 0;
+ kk = 0;
+ if(kmax0>0)
+ {
+ for( ; kk<kmax0; kk++)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ offset += 1;
+ }
+ offset = offset-ps+ps*sdd;
+ }
+ for( ; kk<kmax-3; kk+=4)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ pC11[1+offset+ps*jj] += w0 * pC10[1+offset];
+ pC11[2+offset+ps*jj] += w0 * pC10[2+offset];
+ pC11[3+offset+ps*jj] += w0 * pC10[3+offset];
+ offset += ps*sdd;
+ }
+ for(ll=0; ll<kmax-kk; ll++)
+ {
+ pC11[0+offset+ps*jj] += w0 * pC10[0+offset];
+ offset += 1;
+ }
+ }
+ }
+ }
+ return;
+ }
+
+
+
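+// apply the block of four Householder reflectors stored in pD (vectors) and dD (tau factors) to the m x n matrix pC0; the 4x4 triangular factor T is formed on the fly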
+void kernel_dlarf_4_lib4(int m, int n, double *pD, int sdd, double *dD, double *pC0, int sdc)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, ll;
+ const int ps = 4;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ double tmp, d0, d1, d2, d3;
+ double *pC;
+ double pT[16];// = {};
+ int ldt = 4;
+ double pW[8];// = {};
+ int ldw = 2;
+ // dot product of v
+ v10 = 0.0;
+ v20 = 0.0;
+ v30 = 0.0;
+ v21 = 0.0;
+ v31 = 0.0;
+ v32 = 0.0;
+ if(m>1)
+ {
+ v10 = 1.0 * pD[1+ps*0];
+ if(m>2)
+ {
+ v10 += pD[2+ps*1] * pD[2+ps*0];
+ v20 = 1.0 * pD[2+ps*0];
+ v21 = 1.0 * pD[2+ps*1];
+ if(m>3)
+ {
+ v10 += pD[3+ps*1] * pD[3+ps*0];
+ v20 += pD[3+ps*2] * pD[3+ps*0];
+ v21 += pD[3+ps*2] * pD[3+ps*1];
+ v30 = 1.0 * pD[3+ps*0];
+ v31 = 1.0 * pD[3+ps*1];
+ v32 = 1.0 * pD[3+ps*2];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ // compute lower triangular T containing tau for matrix update
+ pT[0+ldt*0] = dD[0];
+ pT[1+ldt*1] = dD[1];
+ pT[2+ldt*2] = dD[2];
+ pT[3+ldt*3] = dD[3];
+ pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+ pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+ pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+ pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+ pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+ pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+ // downgrade matrix
+ pW[0] = 0.0;
+ pW[1] = 0.0;
+ pW[2] = 0.0;
+ pW[3] = 0.0;
+ pW[4] = 0.0;
+ pW[5] = 0.0;
+ pW[6] = 0.0;
+ pW[7] = 0.0;
+ ii = 0;
+ for( ; ii<n-1; ii+=2)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ tmp = pC[0+ps*1];
+ pW[1+ldw*0] = tmp;
+ if(m>1)
+ {
+ d0 = pD[1+ps*0];
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] = tmp;
+ tmp = pC[1+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] = tmp;
+ if(m>2)
+ {
+ d0 = pD[2+ps*0];
+ d1 = pD[2+ps*1];
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] = tmp;
+ tmp = pC[2+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] = tmp;
+ if(m>3)
+ {
+ d0 = pD[3+ps*0];
+ d1 = pD[3+ps*1];
+ d2 = pD[3+ps*2];
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] = tmp;
+ tmp = pC[3+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pD[0+jj*sdd+ps*0];
+ d1 = pD[0+jj*sdd+ps*1];
+ d2 = pD[0+jj*sdd+ps*2];
+ d3 = pD[0+jj*sdd+ps*3];
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[0+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[1+jj*sdd+ps*0];
+ d1 = pD[1+jj*sdd+ps*1];
+ d2 = pD[1+jj*sdd+ps*2];
+ d3 = pD[1+jj*sdd+ps*3];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[1+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[2+jj*sdd+ps*0];
+ d1 = pD[2+jj*sdd+ps*1];
+ d2 = pD[2+jj*sdd+ps*2];
+ d3 = pD[2+jj*sdd+ps*3];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[2+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ //
+ d0 = pD[3+jj*sdd+ps*0];
+ d1 = pD[3+jj*sdd+ps*1];
+ d2 = pD[3+jj*sdd+ps*2];
+ d3 = pD[3+jj*sdd+ps*3];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[3+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pD[ll+jj*sdd+ps*0];
+ d1 = pD[ll+jj*sdd+ps*1];
+ d2 = pD[ll+jj*sdd+ps*2];
+ d3 = pD[ll+jj*sdd+ps*3];
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * d0;
+ pW[0+ldw*1] += tmp * d1;
+ pW[0+ldw*2] += tmp * d2;
+ pW[0+ldw*3] += tmp * d3;
+ tmp = pC[ll+jj*sdc+ps*1];
+ pW[1+ldw*0] += tmp * d0;
+ pW[1+ldw*1] += tmp * d1;
+ pW[1+ldw*2] += tmp * d2;
+ pW[1+ldw*3] += tmp * d3;
+ }
+ // compute W^T *= T
+ pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+ pW[1+ldw*3] = pT[3+ldt*0]*pW[1+ldw*0] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[1+ldw*2] + pT[3+ldt*3]*pW[1+ldw*3];
+ pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+ pW[1+ldw*2] = pT[2+ldt*0]*pW[1+ldw*0] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[1+ldw*2];
+ pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+ pW[1+ldw*1] = pT[1+ldt*0]*pW[1+ldw*0] + pT[1+ldt*1]*pW[1+ldw*1];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ pW[1+ldw*0] = pT[0+ldt*0]*pW[1+ldw*0];
+ // compute C -= V * W^T
+ pC[0+ps*0] -= pW[0+ldw*0];
+ pC[0+ps*1] -= pW[1+ldw*0];
+ if(m>1)
+ {
+ pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+ pC[1+ps*1] -= pD[1+ps*0]*pW[1+ldw*0] + pW[1+ldw*1];
+ if(m>2)
+ {
+ pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+ pC[2+ps*1] -= pD[2+ps*0]*pW[1+ldw*0] + pD[2+ps*1]*pW[1+ldw*1] + pW[1+ldw*2];
+ if(m>3)
+ {
+ pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+ pC[3+ps*1] -= pD[3+ps*0]*pW[1+ldw*0] + pD[3+ps*1]*pW[1+ldw*1] + pD[3+ps*2]*pW[1+ldw*2] + pW[1+ldw*3];
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pD[0+jj*sdd+ps*0];
+ d1 = pD[0+jj*sdd+ps*1];
+ d2 = pD[0+jj*sdd+ps*2];
+ d3 = pD[0+jj*sdd+ps*3];
+ pC[0+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[0+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[1+jj*sdd+ps*0];
+ d1 = pD[1+jj*sdd+ps*1];
+ d2 = pD[1+jj*sdd+ps*2];
+ d3 = pD[1+jj*sdd+ps*3];
+ pC[1+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[1+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[2+jj*sdd+ps*0];
+ d1 = pD[2+jj*sdd+ps*1];
+ d2 = pD[2+jj*sdd+ps*2];
+ d3 = pD[2+jj*sdd+ps*3];
+ pC[2+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[2+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ //
+ d0 = pD[3+jj*sdd+ps*0];
+ d1 = pD[3+jj*sdd+ps*1];
+ d2 = pD[3+jj*sdd+ps*2];
+ d3 = pD[3+jj*sdd+ps*3];
+ pC[3+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[3+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pD[ll+jj*sdd+ps*0];
+ d1 = pD[ll+jj*sdd+ps*1];
+ d2 = pD[ll+jj*sdd+ps*2];
+ d3 = pD[ll+jj*sdd+ps*3];
+ pC[ll+jj*sdc+ps*0] -= d0*pW[0+ldw*0] + d1*pW[0+ldw*1] + d2*pW[0+ldw*2] + d3*pW[0+ldw*3];
+ pC[ll+jj*sdc+ps*1] -= d0*pW[1+ldw*0] + d1*pW[1+ldw*1] + d2*pW[1+ldw*2] + d3*pW[1+ldw*3];
+ }
+ }
+ for( ; ii<n; ii++)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ if(m>1)
+ {
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += tmp * pD[1+ps*0];
+ pW[0+ldw*1] = tmp;
+ if(m>2)
+ {
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += tmp * pD[2+ps*0];
+ pW[0+ldw*1] += tmp * pD[2+ps*1];
+ pW[0+ldw*2] = tmp;
+ if(m>3)
+ {
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += tmp * pD[3+ps*0];
+ pW[0+ldw*1] += tmp * pD[3+ps*1];
+ pW[0+ldw*2] += tmp * pD[3+ps*2];
+ pW[0+ldw*3] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[0+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[0+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[0+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[0+jj*sdd+ps*3];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[1+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[1+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[1+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[1+jj*sdd+ps*3];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[2+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[2+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[2+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[2+jj*sdd+ps*3];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[3+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[3+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[3+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[3+jj*sdd+ps*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += tmp * pD[ll+jj*sdd+ps*0];
+ pW[0+ldw*1] += tmp * pD[ll+jj*sdd+ps*1];
+ pW[0+ldw*2] += tmp * pD[ll+jj*sdd+ps*2];
+ pW[0+ldw*3] += tmp * pD[ll+jj*sdd+ps*3];
+ }
+ // compute W^T *= T
+ pW[0+ldw*3] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[0+ldw*1] + pT[3+ldt*2]*pW[0+ldw*2] + pT[3+ldt*3]*pW[0+ldw*3];
+ pW[0+ldw*2] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[0+ldw*1] + pT[2+ldt*2]*pW[0+ldw*2];
+ pW[0+ldw*1] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[0+ldw*1];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ // compute C -= V * W^T
+ pC[0+ps*0] -= pW[0+ldw*0];
+ if(m>1)
+ {
+ pC[1+ps*0] -= pD[1+ps*0]*pW[0+ldw*0] + pW[0+ldw*1];
+ if(m>2)
+ {
+ pC[2+ps*0] -= pD[2+ps*0]*pW[0+ldw*0] + pD[2+ps*1]*pW[0+ldw*1] + pW[0+ldw*2];
+ if(m>3)
+ {
+ pC[3+ps*0] -= pD[3+ps*0]*pW[0+ldw*0] + pD[3+ps*1]*pW[0+ldw*1] + pD[3+ps*2]*pW[0+ldw*2] + pW[0+ldw*3];
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ pC[0+jj*sdc+ps*0] -= pD[0+jj*sdd+ps*0]*pW[0+ldw*0] + pD[0+jj*sdd+ps*1]*pW[0+ldw*1] + pD[0+jj*sdd+ps*2]*pW[0+ldw*2] + pD[0+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[1+jj*sdc+ps*0] -= pD[1+jj*sdd+ps*0]*pW[0+ldw*0] + pD[1+jj*sdd+ps*1]*pW[0+ldw*1] + pD[1+jj*sdd+ps*2]*pW[0+ldw*2] + pD[1+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[2+jj*sdc+ps*0] -= pD[2+jj*sdd+ps*0]*pW[0+ldw*0] + pD[2+jj*sdd+ps*1]*pW[0+ldw*1] + pD[2+jj*sdd+ps*2]*pW[0+ldw*2] + pD[2+jj*sdd+ps*3]*pW[0+ldw*3];
+ pC[3+jj*sdc+ps*0] -= pD[3+jj*sdd+ps*0]*pW[0+ldw*0] + pD[3+jj*sdd+ps*1]*pW[0+ldw*1] + pD[3+jj*sdd+ps*2]*pW[0+ldw*2] + pD[3+jj*sdd+ps*3]*pW[0+ldw*3];
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ pC[ll+jj*sdc+ps*0] -= pD[ll+jj*sdd+ps*0]*pW[0+ldw*0] + pD[ll+jj*sdd+ps*1]*pW[0+ldw*1] + pD[ll+jj*sdd+ps*2]*pW[0+ldw*2] + pD[ll+jj*sdd+ps*3]*pW[0+ldw*3];
+ }
+ }
+
+ return;
+ }
+
+
+
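+// same as kernel_dlarf_4_lib4, but the reflector vectors are also provided transposed in pVt to get unit-stride accesses in the W = C'*V step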
+void kernel_dlarf_t_4_lib4(int m, int n, double *pD, int sdd, double *pVt, double *dD, double *pC0, int sdc)
+ {
+ if(m<=0 | n<=0)
+ return;
+ int ii, jj, ll;
+ const int ps = 4;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ double c00, c01,
+ c10, c11,
+ c20, c21,
+ c30, c31;
+ double a0, a1, a2, a3, b0, b1;
+ double tmp, d0, d1, d2, d3;
+ double *pC;
+ double pT[16];// = {};
+ int ldt = 4;
+ double pW[8];// = {};
+ int ldw = 4;
+ // dot product of v
+ v10 = 0.0;
+ v20 = 0.0;
+ v30 = 0.0;
+ v21 = 0.0;
+ v31 = 0.0;
+ v32 = 0.0;
+ if(m>1)
+ {
+ v10 = 1.0 * pD[1+ps*0];
+ if(m>2)
+ {
+ v10 += pD[2+ps*1] * pD[2+ps*0];
+ v20 = 1.0 * pD[2+ps*0];
+ v21 = 1.0 * pD[2+ps*1];
+ if(m>3)
+ {
+ v10 += pD[3+ps*1] * pD[3+ps*0];
+ v20 += pD[3+ps*2] * pD[3+ps*0];
+ v21 += pD[3+ps*2] * pD[3+ps*1];
+ v30 = 1.0 * pD[3+ps*0];
+ v31 = 1.0 * pD[3+ps*1];
+ v32 = 1.0 * pD[3+ps*2];
+ }
+ }
+ }
+ for(ii=4; ii<m-3; ii+=4)
+ {
+ v10 += pD[0+ii*sdd+ps*1] * pD[0+ii*sdd+ps*0];
+ v20 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*0];
+ v21 += pD[0+ii*sdd+ps*2] * pD[0+ii*sdd+ps*1];
+ v30 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*0];
+ v31 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*1];
+ v32 += pD[0+ii*sdd+ps*3] * pD[0+ii*sdd+ps*2];
+ v10 += pD[1+ii*sdd+ps*1] * pD[1+ii*sdd+ps*0];
+ v20 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*0];
+ v21 += pD[1+ii*sdd+ps*2] * pD[1+ii*sdd+ps*1];
+ v30 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*0];
+ v31 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*1];
+ v32 += pD[1+ii*sdd+ps*3] * pD[1+ii*sdd+ps*2];
+ v10 += pD[2+ii*sdd+ps*1] * pD[2+ii*sdd+ps*0];
+ v20 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*0];
+ v21 += pD[2+ii*sdd+ps*2] * pD[2+ii*sdd+ps*1];
+ v30 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*0];
+ v31 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*1];
+ v32 += pD[2+ii*sdd+ps*3] * pD[2+ii*sdd+ps*2];
+ v10 += pD[3+ii*sdd+ps*1] * pD[3+ii*sdd+ps*0];
+ v20 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*0];
+ v21 += pD[3+ii*sdd+ps*2] * pD[3+ii*sdd+ps*1];
+ v30 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*0];
+ v31 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*1];
+ v32 += pD[3+ii*sdd+ps*3] * pD[3+ii*sdd+ps*2];
+ }
+ for(ll=0; ll<m-ii; ll++)
+ {
+ v10 += pD[ll+ii*sdd+ps*1] * pD[ll+ii*sdd+ps*0];
+ v20 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*0];
+ v21 += pD[ll+ii*sdd+ps*2] * pD[ll+ii*sdd+ps*1];
+ v30 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*0];
+ v31 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*1];
+ v32 += pD[ll+ii*sdd+ps*3] * pD[ll+ii*sdd+ps*2];
+ }
+ // compute lower triangular T containing tau for matrix update
+ pT[0+ldt*0] = dD[0];
+ pT[1+ldt*1] = dD[1];
+ pT[2+ldt*2] = dD[2];
+ pT[3+ldt*3] = dD[3];
+ pT[1+ldt*0] = - dD[1] * (v10*pT[0+ldt*0]);
+ pT[2+ldt*1] = - dD[2] * (v21*pT[1+ldt*1]);
+ pT[3+ldt*2] = - dD[3] * (v32*pT[2+ldt*2]);
+ pT[2+ldt*0] = - dD[2] * (v20*pT[0+ldt*0] + v21*pT[1+ldt*0]);
+ pT[3+ldt*1] = - dD[3] * (v31*pT[1+ldt*1] + v32*pT[2+ldt*1]);
+ pT[3+ldt*0] = - dD[3] * (v30*pT[0+ldt*0] + v31*pT[1+ldt*0] + v32*pT[2+ldt*0]);
+ // downgrade matrix
+ pW[0] = 0.0;
+ pW[1] = 0.0;
+ pW[2] = 0.0;
+ pW[3] = 0.0;
+ pW[4] = 0.0;
+ pW[5] = 0.0;
+ pW[6] = 0.0;
+ pW[7] = 0.0;
+ ii = 0;
+ for( ; ii<n-1; ii+=2)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ tmp = pC[0+ps*1];
+ pW[0+ldw*1] = tmp;
+ if(m>1)
+ {
+ d0 = pVt[0+ps*1];
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] = tmp;
+ tmp = pC[1+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] = tmp;
+ if(m>2)
+ {
+ d0 = pVt[0+ps*2];
+ d1 = pVt[1+ps*2];
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] = tmp;
+ tmp = pC[2+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] = tmp;
+ if(m>3)
+ {
+ d0 = pVt[0+ps*3];
+ d1 = pVt[1+ps*3];
+ d2 = pVt[2+ps*3];
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] = tmp;
+ tmp = pC[3+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pVt[0+ps*(0+jj)];
+ d1 = pVt[1+ps*(0+jj)];
+ d2 = pVt[2+ps*(0+jj)];
+ d3 = pVt[3+ps*(0+jj)];
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[0+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(1+jj)];
+ d1 = pVt[1+ps*(1+jj)];
+ d2 = pVt[2+ps*(1+jj)];
+ d3 = pVt[3+ps*(1+jj)];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[1+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(2+jj)];
+ d1 = pVt[1+ps*(2+jj)];
+ d2 = pVt[2+ps*(2+jj)];
+ d3 = pVt[3+ps*(2+jj)];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[2+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(3+jj)];
+ d1 = pVt[1+ps*(3+jj)];
+ d2 = pVt[2+ps*(3+jj)];
+ d3 = pVt[3+ps*(3+jj)];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[3+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pVt[0+ps*(ll+jj)];
+ d1 = pVt[1+ps*(ll+jj)];
+ d2 = pVt[2+ps*(ll+jj)];
+ d3 = pVt[3+ps*(ll+jj)];
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ tmp = pC[ll+jj*sdc+ps*1];
+ pW[0+ldw*1] += d0 * tmp;
+ pW[1+ldw*1] += d1 * tmp;
+ pW[2+ldw*1] += d2 * tmp;
+ pW[3+ldw*1] += d3 * tmp;
+ }
+ // compute W^T *= T
+ pW[3+ldw*0] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[1+ldw*0] + pT[3+ldt*2]*pW[2+ldw*0] + pT[3+ldt*3]*pW[3+ldw*0];
+ pW[3+ldw*1] = pT[3+ldt*0]*pW[0+ldw*1] + pT[3+ldt*1]*pW[1+ldw*1] + pT[3+ldt*2]*pW[2+ldw*1] + pT[3+ldt*3]*pW[3+ldw*1];
+ pW[2+ldw*0] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[1+ldw*0] + pT[2+ldt*2]*pW[2+ldw*0];
+ pW[2+ldw*1] = pT[2+ldt*0]*pW[0+ldw*1] + pT[2+ldt*1]*pW[1+ldw*1] + pT[2+ldt*2]*pW[2+ldw*1];
+ pW[1+ldw*0] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[1+ldw*0];
+ pW[1+ldw*1] = pT[1+ldt*0]*pW[0+ldw*1] + pT[1+ldt*1]*pW[1+ldw*1];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ pW[0+ldw*1] = pT[0+ldt*0]*pW[0+ldw*1];
+ // compute C -= V * W^T
+ jj = 0;
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ c01 = pC[0+jj*sdc+ps*1];
+ c11 = pC[1+jj*sdc+ps*1];
+ c21 = pC[2+jj*sdc+ps*1];
+ c31 = pC[3+jj*sdc+ps*1];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[0+ldw*1];
+ c01 -= b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[1+ldw*1];
+ c11 -= b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c20 -= b0;
+ c30 -= a3*b0;
+ b1 = pW[2+ldw*1];
+ c21 -= b1;
+ c31 -= a3*b1;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c30 -= b0;
+ b1 = pW[3+ldw*1];
+ c31 -= b1;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[0+jj*sdc+ps*1] = c01;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[1+jj*sdc+ps*1] = c11;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[2+jj*sdc+ps*1] = c21;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*0] = c30;
+ pC[3+jj*sdc+ps*1] = c31;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ c01 = pC[0+jj*sdc+ps*1];
+ c11 = pC[1+jj*sdc+ps*1];
+ c21 = pC[2+jj*sdc+ps*1];
+ c31 = pC[3+jj*sdc+ps*1];
+ //
+ a0 = pD[0+jj*sdd+ps*0];
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[0+ldw*1];
+ c01 -= a0*b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ //
+ a0 = pD[0+jj*sdd+ps*1];
+ a1 = pD[1+jj*sdd+ps*1];
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[1+ldw*1];
+ c01 -= a0*b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ //
+ a0 = pD[0+jj*sdd+ps*2];
+ a1 = pD[1+jj*sdd+ps*2];
+ a2 = pD[2+jj*sdd+ps*2];
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[2+ldw*1];
+ c01 -= a0*b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ //
+ a0 = pD[0+jj*sdd+ps*3];
+ a1 = pD[1+jj*sdd+ps*3];
+ a2 = pD[2+jj*sdd+ps*3];
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ b1 = pW[3+ldw*1];
+ c01 -= a0*b1;
+ c11 -= a1*b1;
+ c21 -= a2*b1;
+ c31 -= a3*b1;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[3+jj*sdc+ps*0] = c30;
+ pC[0+jj*sdc+ps*1] = c01;
+ pC[1+jj*sdc+ps*1] = c11;
+ pC[2+jj*sdc+ps*1] = c21;
+ pC[3+jj*sdc+ps*1] = c31;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ // load
+ c00 = pC[ll+jj*sdc+ps*0];
+ c01 = pC[ll+jj*sdc+ps*1];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= a0*b0;
+ b1 = pW[0+ldw*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c00 -= a0*b0;
+ b1 = pW[1+ldw*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c00 -= a0*b0;
+ b1 = pW[2+ldw*1];
+ c01 -= a0*b1;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c00 -= a0*b0;
+ b1 = pW[3+ldw*1];
+ c01 -= a0*b1;
+ // store
+ pC[ll+jj*sdc+ps*0] = c00;
+ pC[ll+jj*sdc+ps*1] = c01;
+ }
+ }
+ for( ; ii<n; ii++)
+ {
+ pC = pC0+ii*ps;
+ // compute W^T = C^T * V
+ tmp = pC[0+ps*0];
+ pW[0+ldw*0] = tmp;
+ if(m>1)
+ {
+ d0 = pVt[0+ps*1];
+ tmp = pC[1+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] = tmp;
+ if(m>2)
+ {
+ d0 = pVt[0+ps*2];
+ d1 = pVt[1+ps*2];
+ tmp = pC[2+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] = tmp;
+ if(m>3)
+ {
+ d0 = pVt[0+ps*3];
+ d1 = pVt[1+ps*3];
+ d2 = pVt[2+ps*3];
+ tmp = pC[3+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] = tmp;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ //
+ d0 = pVt[0+ps*(0+jj)];
+ d1 = pVt[1+ps*(0+jj)];
+ d2 = pVt[2+ps*(0+jj)];
+ d3 = pVt[3+ps*(0+jj)];
+ tmp = pC[0+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(1+jj)];
+ d1 = pVt[1+ps*(1+jj)];
+ d2 = pVt[2+ps*(1+jj)];
+ d3 = pVt[3+ps*(1+jj)];
+ tmp = pC[1+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(2+jj)];
+ d1 = pVt[1+ps*(2+jj)];
+ d2 = pVt[2+ps*(2+jj)];
+ d3 = pVt[3+ps*(2+jj)];
+ tmp = pC[2+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ //
+ d0 = pVt[0+ps*(3+jj)];
+ d1 = pVt[1+ps*(3+jj)];
+ d2 = pVt[2+ps*(3+jj)];
+ d3 = pVt[3+ps*(3+jj)];
+ tmp = pC[3+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ d0 = pVt[0+ps*(ll+jj)];
+ d1 = pVt[1+ps*(ll+jj)];
+ d2 = pVt[2+ps*(ll+jj)];
+ d3 = pVt[3+ps*(ll+jj)];
+ tmp = pC[ll+jj*sdc+ps*0];
+ pW[0+ldw*0] += d0 * tmp;
+ pW[1+ldw*0] += d1 * tmp;
+ pW[2+ldw*0] += d2 * tmp;
+ pW[3+ldw*0] += d3 * tmp;
+ }
+ // compute W^T *= T
+ pW[3+ldw*0] = pT[3+ldt*0]*pW[0+ldw*0] + pT[3+ldt*1]*pW[1+ldw*0] + pT[3+ldt*2]*pW[2+ldw*0] + pT[3+ldt*3]*pW[3+ldw*0];
+ pW[2+ldw*0] = pT[2+ldt*0]*pW[0+ldw*0] + pT[2+ldt*1]*pW[1+ldw*0] + pT[2+ldt*2]*pW[2+ldw*0];
+ pW[1+ldw*0] = pT[1+ldt*0]*pW[0+ldw*0] + pT[1+ldt*1]*pW[1+ldw*0];
+ pW[0+ldw*0] = pT[0+ldt*0]*pW[0+ldw*0];
+ // compute C -= V * W^T
+ jj = 0;
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ // rank1
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // rank2
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c10 -= b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // rank3
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c20 -= b0;
+ c30 -= a3*b0;
+ // rank4
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c30 -= b0;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ if(m>1)
+ {
+ pC[1+jj*sdc+ps*0] = c10;
+ if(m>2)
+ {
+ pC[2+jj*sdc+ps*0] = c20;
+ if(m>3)
+ {
+ pC[3+jj*sdc+ps*0] = c30;
+ }
+ }
+ }
+ for(jj=4; jj<m-3; jj+=4)
+ {
+ // load
+ c00 = pC[0+jj*sdc+ps*0];
+ c10 = pC[1+jj*sdc+ps*0];
+ c20 = pC[2+jj*sdc+ps*0];
+ c30 = pC[3+jj*sdc+ps*0];
+ //
+ a0 = pD[0+jj*sdd+ps*0];
+ a1 = pD[1+jj*sdd+ps*0];
+ a2 = pD[2+jj*sdd+ps*0];
+ a3 = pD[3+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*1];
+ a1 = pD[1+jj*sdd+ps*1];
+ a2 = pD[2+jj*sdd+ps*1];
+ a3 = pD[3+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*2];
+ a1 = pD[1+jj*sdd+ps*2];
+ a2 = pD[2+jj*sdd+ps*2];
+ a3 = pD[3+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ //
+ a0 = pD[0+jj*sdd+ps*3];
+ a1 = pD[1+jj*sdd+ps*3];
+ a2 = pD[2+jj*sdd+ps*3];
+ a3 = pD[3+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c00 -= a0*b0;
+ c10 -= a1*b0;
+ c20 -= a2*b0;
+ c30 -= a3*b0;
+ // store
+ pC[0+jj*sdc+ps*0] = c00;
+ pC[1+jj*sdc+ps*0] = c10;
+ pC[2+jj*sdc+ps*0] = c20;
+ pC[3+jj*sdc+ps*0] = c30;
+ }
+ for(ll=0; ll<m-jj; ll++)
+ {
+ // load
+ c00 = pC[ll+jj*sdc+ps*0];
+ //
+ a0 = pD[ll+jj*sdd+ps*0];
+ b0 = pW[0+ldw*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*1];
+ b0 = pW[1+ldw*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*2];
+ b0 = pW[2+ldw*0];
+ c00 -= a0*b0;
+ //
+ a0 = pD[ll+jj*sdd+ps*3];
+ b0 = pW[3+ldw*0];
+ c00 -= a0*b0;
+ // store
+ pC[ll+jj*sdc+ps*0] = c00;
+ }
+ }
+
+ return;
+ }
+
+
+
+// assume n>=4
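+// Householder LQ factorization of a 4 x n block pD: on exit the lower triangle holds L, the entries right of the diagonal the (row) reflector vectors, and dD[0:3] the tau factors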
+void kernel_dgelqf_4_lib4(int n, double *pD, double *dD)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w1, w2, w3;
+ const int ps = 4;
+ // first column
+ beta = 0.0;
+ for(ii=1; ii<n; ii++)
+ {
+ tmp = pD[0+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[0] = 0.0;
+ }
+ else
+ {
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau0
+ dD[0] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v0
+ pD[0+ps*0] = beta;
+ for(ii=1; ii<n; ii++)
+ {
+ pD[0+ps*ii] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w1 = pD[1+ps*0];
+ w2 = pD[2+ps*0];
+ w3 = pD[3+ps*0];
+ w1 += pD[1+ps*1] * pD[0+ps*1];
+ w2 += pD[2+ps*1] * pD[0+ps*1];
+ w3 += pD[3+ps*1] * pD[0+ps*1];
+ w1 += pD[1+ps*2] * pD[0+ps*2];
+ w2 += pD[2+ps*2] * pD[0+ps*2];
+ w3 += pD[3+ps*2] * pD[0+ps*2];
+ w1 += pD[1+ps*3] * pD[0+ps*3];
+ w2 += pD[2+ps*3] * pD[0+ps*3];
+ w3 += pD[3+ps*3] * pD[0+ps*3];
+ for(ii=4; ii<n; ii++)
+ {
+ w1 += pD[1+ps*ii] * pD[0+ps*ii];
+ w2 += pD[2+ps*ii] * pD[0+ps*ii];
+ w3 += pD[3+ps*ii] * pD[0+ps*ii];
+ }
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ pD[1+ps*0] += w1;
+ pD[2+ps*0] += w2;
+ pD[3+ps*0] += w3;
+ pD[1+ps*1] += w1 * pD[0+ps*1];
+ pD[2+ps*1] += w2 * pD[0+ps*1];
+ pD[3+ps*1] += w3 * pD[0+ps*1];
+ pD[1+ps*2] += w1 * pD[0+ps*2];
+ pD[2+ps*2] += w2 * pD[0+ps*2];
+ pD[3+ps*2] += w3 * pD[0+ps*2];
+ pD[1+ps*3] += w1 * pD[0+ps*3];
+ pD[2+ps*3] += w2 * pD[0+ps*3];
+ pD[3+ps*3] += w3 * pD[0+ps*3];
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] += w1 * pD[0+ps*ii];
+ pD[2+ps*ii] += w2 * pD[0+ps*ii];
+ pD[3+ps*ii] += w3 * pD[0+ps*ii];
+ }
+ // second column
+ beta = 0.0;
+ for(ii=2; ii<n; ii++)
+ {
+ tmp = pD[1+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[1] = 0.0;
+ }
+ else
+ {
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau1
+ dD[1] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+		// compute v1
+ pD[1+ps*1] = beta;
+ for(ii=2; ii<n; ii++)
+ {
+ pD[1+ps*ii] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w2 = pD[2+ps*1];
+ w3 = pD[3+ps*1];
+ w2 += pD[2+ps*2] * pD[1+ps*2];
+ w3 += pD[3+ps*2] * pD[1+ps*2];
+ w2 += pD[2+ps*3] * pD[1+ps*3];
+ w3 += pD[3+ps*3] * pD[1+ps*3];
+ for(ii=4; ii<n; ii++)
+ {
+ w2 += pD[2+ps*ii] * pD[1+ps*ii];
+ w3 += pD[3+ps*ii] * pD[1+ps*ii];
+ }
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ pD[2+ps*1] += w2;
+ pD[3+ps*1] += w3;
+ pD[2+ps*2] += w2 * pD[1+ps*2];
+ pD[3+ps*2] += w3 * pD[1+ps*2];
+ pD[2+ps*3] += w2 * pD[1+ps*3];
+ pD[3+ps*3] += w3 * pD[1+ps*3];
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] += w2 * pD[1+ps*ii];
+ pD[3+ps*ii] += w3 * pD[1+ps*ii];
+ }
+ // third column
+ beta = 0.0;
+ for(ii=3; ii<n; ii++)
+ {
+ tmp = pD[2+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[2] = 0.0;
+ }
+ else
+ {
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+		// tau2
+ dD[2] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+		// compute v2
+ pD[2+ps*2] = beta;
+ for(ii=3; ii<n; ii++)
+ {
+ pD[2+ps*ii] *= tmp;
+ }
+ }
+ // gemv_t & ger
+ w3 = pD[3+ps*2];
+ w3 += pD[3+ps*3] * pD[2+ps*3];
+ for(ii=4; ii<n; ii++)
+ {
+ w3 += pD[3+ps*ii] * pD[2+ps*ii];
+ }
+ w3 = - dD[2] * w3;
+ pD[3+ps*2] += w3;
+ pD[3+ps*3] += w3 * pD[2+ps*3];
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] += w3 * pD[2+ps*ii];
+ }
+ // fourth column
+ beta = 0.0;
+ for(ii=4; ii<n; ii++)
+ {
+ tmp = pD[3+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ // tau
+ dD[3] = 0.0;
+ }
+ else
+ {
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ // tau3
+ dD[3] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ // compute v3
+ pD[3+ps*3] = beta;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] *= tmp;
+ }
+ }
+ return;
+ }
+
+
+
+// unblocked algorithm
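+// variable-size version: computes k Householder reflectors of the m x n
+// sub-matrix stored panel-major in (pD, sdd) starting at row offset offD; the
+// main loop builds two reflectors at a time and applies them to the rows
+// below through the 2 x 2 triangular factor pT, the remainder loop applies
+// one reflector at a time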
+void kernel_dgelqf_vs_lib4(int m, int n, int k, int offD, double *pD, int sdd, double *dD)
+ {
+ if(m<=0 || n<=0)
+ return;
+ int ii, jj, kk, ll, imax, jmax, jmax0, kmax, kmax0;
+ const int ps = 4;
+ imax = k;//m<n ? m : n;
+ double alpha, beta, tmp;
+ double w00, w01,
+ w10, w11,
+ w20, w21,
+ w30, w31;
+ double *pC00, *pC10, *pC10a, *pC20, *pC20a, *pC01, *pC11;
+ double pT[4];
+ int ldt = 2;
+ double *pD0 = pD-offD;
+ ii = 0;
+#if 1
+ for(; ii<imax-1; ii+=2)
+ {
+ // first row
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ tmp = pC00[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC00[0] = beta;
+ for(jj=1; jj<n-ii; jj++)
+ pC00[0+ps*jj] *= tmp;
+ }
+ pC10 = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ kmax = n-ii;
+ w00 = pC10[0+ps*0]; // pC00[0+ps*0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = - w00*dD[ii];
+ pC10[0+ps*0] += w00; // pC00[0+ps*0] = 1.0
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ // second row
+ pC11 = pC10+ps*1;
+ beta = 0.0;
+ for(jj=1; jj<n-(ii+1); jj++)
+ {
+ tmp = pC11[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[(ii+1)] = 0.0;
+ }
+ else
+ {
+ alpha = pC11[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[(ii+1)] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC11[0+ps*0] = beta;
+ for(jj=1; jj<n-(ii+1); jj++)
+ pC11[0+ps*jj] *= tmp;
+ }
+ // compute T
+ kmax = n-ii;
+ tmp = 1.0*0.0 + pC00[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ tmp += pC00[0+ps*kk]*pC10[0+ps*kk];
+ pT[0+ldt*0] = dD[ii+0];
+ pT[0+ldt*1] = - dD[ii+1] * tmp * dD[ii+0];
+ pT[1+ldt*1] = dD[ii+1];
+ // downgrade
+ kmax = n-ii;
+ jmax = m-ii-2;
+ jmax0 = (ps-((ii+2+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ jj = 0;
+ pC20a = &pD0[((offD+ii+2)&(ps-1))+((offD+ii+2)-((offD+ii+2)&(ps-1)))*sdd+ii*ps];
+ pC20 = pC20a;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+ w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+ w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+ }
+ w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
+ w00 = - w00*pT[0+ldt*0];
+ pC20[0+ps*0] += w00*1.0 + w01*0.0;
+ pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+ }
+ pC20 += 1;
+ }
+ pC20 += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+ w10 = pC20[1+ps*0]*1.0 + pC20[1+ps*1]*pC00[0+ps*1];
+ w20 = pC20[2+ps*0]*1.0 + pC20[2+ps*1]*pC00[0+ps*1];
+ w30 = pC20[3+ps*0]*1.0 + pC20[3+ps*1]*pC00[0+ps*1];
+ w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+ w11 = pC20[1+ps*0]*0.0 + pC20[1+ps*1]*1.0;
+ w21 = pC20[2+ps*0]*0.0 + pC20[2+ps*1]*1.0;
+ w31 = pC20[3+ps*0]*0.0 + pC20[3+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+ w10 += pC20[1+ps*kk]*pC00[0+ps*kk];
+ w20 += pC20[2+ps*kk]*pC00[0+ps*kk];
+ w30 += pC20[3+ps*kk]*pC00[0+ps*kk];
+ w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+ w11 += pC20[1+ps*kk]*pC10[0+ps*kk];
+ w21 += pC20[2+ps*kk]*pC10[0+ps*kk];
+ w31 += pC20[3+ps*kk]*pC10[0+ps*kk];
+ }
+ w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
+ w11 = - w10*pT[0+ldt*1] - w11*pT[1+ldt*1];
+ w21 = - w20*pT[0+ldt*1] - w21*pT[1+ldt*1];
+ w31 = - w30*pT[0+ldt*1] - w31*pT[1+ldt*1];
+ w00 = - w00*pT[0+ldt*0];
+ w10 = - w10*pT[0+ldt*0];
+ w20 = - w20*pT[0+ldt*0];
+ w30 = - w30*pT[0+ldt*0];
+ pC20[0+ps*0] += w00*1.0 + w01*0.0;
+ pC20[1+ps*0] += w10*1.0 + w11*0.0;
+ pC20[2+ps*0] += w20*1.0 + w21*0.0;
+ pC20[3+ps*0] += w30*1.0 + w31*0.0;
+ pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+ pC20[1+ps*1] += w10*pC00[0+ps*1] + w11*1.0;
+ pC20[2+ps*1] += w20*pC00[0+ps*1] + w21*1.0;
+ pC20[3+ps*1] += w30*pC00[0+ps*1] + w31*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+ pC20[1+ps*kk] += w10*pC00[0+ps*kk] + w11*pC10[0+ps*kk];
+ pC20[2+ps*kk] += w20*pC00[0+ps*kk] + w21*pC10[0+ps*kk];
+ pC20[3+ps*kk] += w30*pC00[0+ps*kk] + w31*pC10[0+ps*kk];
+ }
+ pC20 += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ w00 = pC20[0+ps*0]*1.0 + pC20[0+ps*1]*pC00[0+ps*1];
+ w01 = pC20[0+ps*0]*0.0 + pC20[0+ps*1]*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ w00 += pC20[0+ps*kk]*pC00[0+ps*kk];
+ w01 += pC20[0+ps*kk]*pC10[0+ps*kk];
+ }
+ w01 = - w00*pT[0+ldt*1] - w01*pT[1+ldt*1];
+ w00 = - w00*pT[0+ldt*0];
+ pC20[0+ps*0] += w00*1.0 + w01*0.0;
+ pC20[0+ps*1] += w00*pC00[0+ps*1] + w01*1.0;
+ for(kk=2; kk<kmax; kk++)
+ {
+ pC20[0+ps*kk] += w00*pC00[0+ps*kk] + w01*pC10[0+ps*kk];
+ }
+ pC20 += 1;
+ }
+ }
+#endif
+ for(; ii<imax; ii++)
+ {
+ pC00 = &pD0[((offD+ii)&(ps-1))+((offD+ii)-((offD+ii)&(ps-1)))*sdd+ii*ps];
+ beta = 0.0;
+ for(jj=1; jj<n-ii; jj++)
+ {
+ tmp = pC00[0+ps*jj];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[ii] = 0.0;
+ }
+ else
+ {
+ alpha = pC00[0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[ii] = (beta-alpha) / beta;
+ tmp = 1.0 / (alpha-beta);
+ pC00[0] = beta;
+ for(jj=1; jj<n-ii; jj++)
+ pC00[0+ps*jj] *= tmp;
+ }
+ if(ii<n)
+ {
+ kmax = n-ii;
+ jmax = m-ii-1;
+ jmax0 = (ps-((ii+1+offD)&(ps-1)))&(ps-1);
+ jmax0 = jmax<jmax0 ? jmax : jmax0;
+ jj = 0;
+ pC10a = &pD0[((offD+ii+1)&(ps-1))+((offD+ii+1)-((offD+ii+1)&(ps-1)))*sdd+ii*ps];
+ pC10 = pC10a;
+ if(jmax0>0)
+ {
+ for( ; jj<jmax0; jj++)
+ {
+ w00 = pC10[0+ps*0];
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = - w00*dD[ii];
+ pC10[0+ps*0] += w00;
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ pC10 += 1;
+ }
+ pC10 += -ps+ps*sdd;
+ }
+ for( ; jj<jmax-3; jj+=4)
+ {
+ w00 = pC10[0+ps*0];
+ w10 = pC10[1+ps*0];
+ w20 = pC10[2+ps*0];
+ w30 = pC10[3+ps*0];
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk]*pC00[0+ps*kk];
+ w10 += pC10[1+ps*kk]*pC00[0+ps*kk];
+ w20 += pC10[2+ps*kk]*pC00[0+ps*kk];
+ w30 += pC10[3+ps*kk]*pC00[0+ps*kk];
+ }
+ w00 = - w00*dD[ii];
+ w10 = - w10*dD[ii];
+ w20 = - w20*dD[ii];
+ w30 = - w30*dD[ii];
+ pC10[0+ps*0] += w00;
+ pC10[1+ps*0] += w10;
+ pC10[2+ps*0] += w20;
+ pC10[3+ps*0] += w30;
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00*pC00[0+ps*kk];
+ pC10[1+ps*kk] += w10*pC00[0+ps*kk];
+ pC10[2+ps*kk] += w20*pC00[0+ps*kk];
+ pC10[3+ps*kk] += w30*pC00[0+ps*kk];
+ }
+ pC10 += ps*sdd;
+ }
+ for(ll=0; ll<jmax-jj; ll++)
+ {
+ w00 = pC10[0+ps*0];
+ for(kk=1; kk<kmax; kk++)
+ {
+ w00 += pC10[0+ps*kk] * pC00[0+ps*kk];
+ }
+ w00 = - w00*dD[ii];
+ pC10[0+ps*0] += w00;
+ for(kk=1; kk<kmax; kk++)
+ {
+ pC10[0+ps*kk] += w00 * pC00[0+ps*kk];
+ }
+ pC10 += 1;
+ }
+ }
+ }
+ return;
+ }
+
+
+
+// assume kmax>=4
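+// given the 4 reflectors (rows of pD, unit diagonal implicit) and the taus in
+// dD, build the upper triangular 4 x 4 factor pT of the block reflector, in
+// the sign convention used by the kernel_dlarfb4_r_* kernels below, i.e.
+// H = I + V^T*T*V with diag(T) = -tau; only the upper triangle of pT is
+// written, the columns follow the usual larft recurrence
+//   T(0:i-1,i) = -tau_i * T(0:i-1,0:i-1) * (V(0:i-1,:) * v_i^T)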
+void kernel_dlarft_4_lib4(int kmax, double *pD, double *dD, double *pT)
+ {
+ const int ps = 4;
+ int kk;
+ double v10,
+ v20, v21,
+ v30, v31, v32;
+ // 0
+ // 1
+ v10 = pD[0+ps*1];
+ // 2
+ v10 += pD[1+ps*2]*pD[0+ps*2];
+ v20 = pD[0+ps*2];
+ v21 = pD[1+ps*2];
+ // 3
+ v10 += pD[1+ps*3]*pD[0+ps*3];
+ v20 += pD[2+ps*3]*pD[0+ps*3];
+ v21 += pD[2+ps*3]*pD[1+ps*3];
+ v30 = pD[0+ps*3];
+ v31 = pD[1+ps*3];
+ v32 = pD[2+ps*3];
+ //
+ for(kk=4; kk<kmax; kk++)
+ {
+ v10 += pD[1+ps*kk]*pD[0+ps*kk];
+ v20 += pD[2+ps*kk]*pD[0+ps*kk];
+ v30 += pD[3+ps*kk]*pD[0+ps*kk];
+ v21 += pD[2+ps*kk]*pD[1+ps*kk];
+ v31 += pD[3+ps*kk]*pD[1+ps*kk];
+ v32 += pD[3+ps*kk]*pD[2+ps*kk];
+ }
+ pT[0+ps*0] = - dD[0];
+ pT[1+ps*1] = - dD[1];
+ pT[2+ps*2] = - dD[2];
+ pT[3+ps*3] = - dD[3];
+ pT[0+ps*1] = - dD[1] * (v10*pT[0+ps*0]);
+ pT[1+ps*2] = - dD[2] * (v21*pT[1+ps*1]);
+ pT[2+ps*3] = - dD[3] * (v32*pT[2+ps*2]);
+ pT[0+ps*2] = - dD[2] * (v20*pT[0+ps*0] + v21*pT[0+ps*1]);
+ pT[1+ps*3] = - dD[3] * (v31*pT[1+ps*1] + v32*pT[1+ps*2]);
+ pT[0+ps*3] = - dD[3] * (v30*pT[0+ps*0] + v31*pT[0+ps*1] + v32*pT[0+ps*2]);
+ return;
+ }
+
+
+
+// assume n>=4
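+// fused version of kernel_dgelqf_4_lib4 and kernel_dlarft_4_lib4: factorizes
+// the 4 x n panel and builds the 4 x 4 factor pT on the fly, reusing the dot
+// products of the update sweeps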
+void kernel_dgelqf_dlarft4_4_lib4(int n, double *pD, double *dD, double *pT)
+ {
+ int ii, jj, ll;
+ double alpha, beta, tmp, w0, w1, w2, w3;
+ const int ps = 4;
+ // zero tau matrix
+ for(ii=0; ii<16; ii++)
+ pT[ii] = 0.0;
+ // first column
+ beta = 0.0;
+ for(ii=1; ii<n; ii++)
+ {
+ tmp = pD[0+ps*ii];
+ beta += tmp*tmp;
+ }
+ if(beta==0.0)
+ {
+ dD[0] = 0.0;
+ tmp = 0.0;
+ goto col2;
+ }
+ alpha = pD[0+ps*0];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[0] = (beta-alpha) / beta;
+ pT[0+ps*0] = - dD[0];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[0+ps*0] = beta;
+ w1 = pD[1+ps*0];
+ w2 = pD[2+ps*0];
+ w3 = pD[3+ps*0];
+ //
+ pD[0+ps*1] *= tmp;
+ w1 += pD[1+ps*1] * pD[0+ps*1];
+ w2 += pD[2+ps*1] * pD[0+ps*1];
+ w3 += pD[3+ps*1] * pD[0+ps*1];
+ //
+ pD[0+ps*2] *= tmp;
+ w1 += pD[1+ps*2] * pD[0+ps*2];
+ w2 += pD[2+ps*2] * pD[0+ps*2];
+ w3 += pD[3+ps*2] * pD[0+ps*2];
+ //
+ pD[0+ps*3] *= tmp;
+ w1 += pD[1+ps*3] * pD[0+ps*3];
+ w2 += pD[2+ps*3] * pD[0+ps*3];
+ w3 += pD[3+ps*3] * pD[0+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[0+ps*ii] *= tmp;
+ w1 += pD[1+ps*ii] * pD[0+ps*ii];
+ w2 += pD[2+ps*ii] * pD[0+ps*ii];
+ w3 += pD[3+ps*ii] * pD[0+ps*ii];
+ }
+ //
+ w1 = - dD[0] * w1;
+ w2 = - dD[0] * w2;
+ w3 = - dD[0] * w3;
+ //
+ pD[1+ps*0] += w1;
+ pD[2+ps*0] += w2;
+ pD[3+ps*0] += w3;
+ //
+ pD[1+ps*1] += w1 * pD[0+ps*1];
+ pD[2+ps*1] += w2 * pD[0+ps*1];
+ pD[3+ps*1] += w3 * pD[0+ps*1];
+ //
+ pD[1+ps*2] += w1 * pD[0+ps*2];
+ pD[2+ps*2] += w2 * pD[0+ps*2];
+ pD[3+ps*2] += w3 * pD[0+ps*2];
+ beta = pD[1+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] += w1 * pD[0+ps*3];
+ pD[2+ps*3] += w2 * pD[0+ps*3];
+ pD[3+ps*3] += w3 * pD[0+ps*3];
+ beta += pD[1+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] += w1 * pD[0+ps*ii];
+ pD[2+ps*ii] += w2 * pD[0+ps*ii];
+ pD[3+ps*ii] += w3 * pD[0+ps*ii];
+ beta += pD[1+ps*ii] * pD[1+ps*ii];
+ }
+ // second column
+col2:
+ if(beta==0.0)
+ {
+ dD[1] = 0.0;
+ tmp = 0.0;
+ goto col3;
+ }
+ alpha = pD[1+ps*1];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[1] = (beta-alpha) / beta;
+ pT[1+ps*1] = - dD[1];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[1+ps*1] = beta;
+ w0 = pD[0+ps*1]; //
+ w2 = pD[2+ps*1];
+ w3 = pD[3+ps*1];
+ //
+ pD[1+ps*2] *= tmp;
+ w0 += pD[0+ps*2] * pD[1+ps*2]; //
+ w2 += pD[2+ps*2] * pD[1+ps*2];
+ w3 += pD[3+ps*2] * pD[1+ps*2];
+ //
+ pD[1+ps*3] *= tmp;
+ w0 += pD[0+ps*3] * pD[1+ps*3]; //
+ w2 += pD[2+ps*3] * pD[1+ps*3];
+ w3 += pD[3+ps*3] * pD[1+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[1+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[1+ps*ii]; //
+ w2 += pD[2+ps*ii] * pD[1+ps*ii];
+ w3 += pD[3+ps*ii] * pD[1+ps*ii];
+ }
+ //
+ pT[0+ps*1] = - dD[1] * (w0*pT[0+ps*0]);
+ w2 = - dD[1] * w2;
+ w3 = - dD[1] * w3;
+ //
+ pD[2+ps*1] += w2;
+ pD[3+ps*1] += w3;
+ //
+ pD[2+ps*2] += w2 * pD[1+ps*2];
+ pD[3+ps*2] += w3 * pD[1+ps*2];
+ //
+ pD[2+ps*3] += w2 * pD[1+ps*3];
+ pD[3+ps*3] += w3 * pD[1+ps*3];
+ beta = pD[2+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] += w2 * pD[1+ps*ii];
+ pD[3+ps*ii] += w3 * pD[1+ps*ii];
+ beta += pD[2+ps*ii] * pD[2+ps*ii];
+ }
+ // third column
+col3:
+ if(beta==0.0)
+ {
+ dD[2] = 0.0;
+ tmp = 0.0;
+ goto col4;
+ }
+ alpha = pD[2+ps*2];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[2] = (beta-alpha) / beta;
+ pT[2+ps*2] = - dD[2];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[2+ps*2] = beta;
+ w0 = pD[0+ps*2];
+ w1 = pD[1+ps*2];
+ w3 = pD[3+ps*2];
+ //
+ pD[2+ps*3] *= tmp;
+ w0 += pD[0+ps*3] * pD[2+ps*3];
+ w1 += pD[1+ps*3] * pD[2+ps*3];
+ w3 += pD[3+ps*3] * pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[2+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[2+ps*ii];
+ w1 += pD[1+ps*ii] * pD[2+ps*ii];
+ w3 += pD[3+ps*ii] * pD[2+ps*ii];
+ }
+ //
+ pT[1+ps*2] = - dD[2] * (w1*pT[1+ps*1]);
+ pT[0+ps*2] = - dD[2] * (w0*pT[0+ps*0] + w1*pT[0+ps*1]);
+ w3 = - dD[2] * w3;
+ //
+ pD[3+ps*2] += w3;
+ //
+ pD[3+ps*3] += w3 * pD[2+ps*3];
+ //
+ beta = 0.0;
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] += w3 * pD[2+ps*ii];
+ beta += pD[3+ps*ii] * pD[3+ps*ii];
+ }
+ // fourth column
+col4:
+ if(beta==0.0)
+ {
+ dD[3] = 0.0;
+ tmp = 0.0;
+ return;
+ }
+ alpha = pD[3+ps*3];
+ beta += alpha*alpha;
+ beta = sqrt(beta);
+ if(alpha>0)
+ beta = -beta;
+ dD[3] = (beta-alpha) / beta;
+ pT[3+ps*3] = - dD[3];
+ tmp = 1.0 / (alpha-beta);
+ //
+ pD[3+ps*3] = beta;
+ w0 = pD[0+ps*3];
+ w1 = pD[1+ps*3];
+ w2 = pD[2+ps*3];
+ //
+ for(ii=4; ii<n; ii++)
+ {
+ pD[3+ps*ii] *= tmp;
+ w0 += pD[0+ps*ii] * pD[3+ps*ii];
+ w1 += pD[1+ps*ii] * pD[3+ps*ii];
+ w2 += pD[2+ps*ii] * pD[3+ps*ii];
+ }
+ //
+ pT[2+ps*3] = - dD[3] * (w2*pT[2+ps*2]);
+ pT[1+ps*3] = - dD[3] * (w1*pT[1+ps*1] + w2*pT[1+ps*2]);
+ pT[0+ps*3] = - dD[3] * (w0*pT[0+ps*0] + w1*pT[0+ps*1] + w2*pT[0+ps*2]);
+ return;
+ }
+
+
+
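+// apply the block reflector (V in pV, T in pT, as produced by the kernels
+// above) to 4 rows of pD from the right: W = D*V^T, W = W*T, D = D + W*V,
+// where V has an implicit unit diagonal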
+void kernel_dlarfb4_r_4_lib4(int kmax, double *pV, double *pT, double *pD)
+ {
+ const int ps = 4;
+ double pW[16];
+ int kk;
+ // 0
+ pW[0+ps*0] = pD[0+ps*0];
+ pW[1+ps*0] = pD[1+ps*0];
+ pW[2+ps*0] = pD[2+ps*0];
+ pW[3+ps*0] = pD[3+ps*0];
+ // 1
+ pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
+ pW[1+ps*0] += pD[1+ps*1]*pV[0+ps*1];
+ pW[2+ps*0] += pD[2+ps*1]*pV[0+ps*1];
+ pW[3+ps*0] += pD[3+ps*1]*pV[0+ps*1];
+ pW[0+ps*1] = pD[0+ps*1];
+ pW[1+ps*1] = pD[1+ps*1];
+ pW[2+ps*1] = pD[2+ps*1];
+ pW[3+ps*1] = pD[3+ps*1];
+ // 2
+ pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
+ pW[1+ps*0] += pD[1+ps*2]*pV[0+ps*2];
+ pW[2+ps*0] += pD[2+ps*2]*pV[0+ps*2];
+ pW[3+ps*0] += pD[3+ps*2]*pV[0+ps*2];
+ pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
+ pW[1+ps*1] += pD[1+ps*2]*pV[1+ps*2];
+ pW[2+ps*1] += pD[2+ps*2]*pV[1+ps*2];
+ pW[3+ps*1] += pD[3+ps*2]*pV[1+ps*2];
+ pW[0+ps*2] = pD[0+ps*2];
+ pW[1+ps*2] = pD[1+ps*2];
+ pW[2+ps*2] = pD[2+ps*2];
+ pW[3+ps*2] = pD[3+ps*2];
+ // 3
+ pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
+ pW[1+ps*0] += pD[1+ps*3]*pV[0+ps*3];
+ pW[2+ps*0] += pD[2+ps*3]*pV[0+ps*3];
+ pW[3+ps*0] += pD[3+ps*3]*pV[0+ps*3];
+ pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
+ pW[1+ps*1] += pD[1+ps*3]*pV[1+ps*3];
+ pW[2+ps*1] += pD[2+ps*3]*pV[1+ps*3];
+ pW[3+ps*1] += pD[3+ps*3]*pV[1+ps*3];
+ pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
+ pW[1+ps*2] += pD[1+ps*3]*pV[2+ps*3];
+ pW[2+ps*2] += pD[2+ps*3]*pV[2+ps*3];
+ pW[3+ps*2] += pD[3+ps*3]*pV[2+ps*3];
+ pW[0+ps*3] = pD[0+ps*3];
+ pW[1+ps*3] = pD[1+ps*3];
+ pW[2+ps*3] = pD[2+ps*3];
+ pW[3+ps*3] = pD[3+ps*3];
+ //
+ for(kk=4; kk<kmax; kk++)
+ {
+ pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
+ pW[1+ps*0] += pD[1+ps*kk]*pV[0+ps*kk];
+ pW[2+ps*0] += pD[2+ps*kk]*pV[0+ps*kk];
+ pW[3+ps*0] += pD[3+ps*kk]*pV[0+ps*kk];
+ pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
+ pW[1+ps*1] += pD[1+ps*kk]*pV[1+ps*kk];
+ pW[2+ps*1] += pD[2+ps*kk]*pV[1+ps*kk];
+ pW[3+ps*1] += pD[3+ps*kk]*pV[1+ps*kk];
+ pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
+ pW[1+ps*2] += pD[1+ps*kk]*pV[2+ps*kk];
+ pW[2+ps*2] += pD[2+ps*kk]*pV[2+ps*kk];
+ pW[3+ps*2] += pD[3+ps*kk]*pV[2+ps*kk];
+ pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
+ pW[1+ps*3] += pD[1+ps*kk]*pV[3+ps*kk];
+ pW[2+ps*3] += pD[2+ps*kk]*pV[3+ps*kk];
+ pW[3+ps*3] += pD[3+ps*kk]*pV[3+ps*kk];
+ }
+ //
+ pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3];
+ pW[1+ps*3] = pW[1+ps*0]*pT[0+ps*3] + pW[1+ps*1]*pT[1+ps*3] + pW[1+ps*2]*pT[2+ps*3] + pW[1+ps*3]*pT[3+ps*3];
+ pW[2+ps*3] = pW[2+ps*0]*pT[0+ps*3] + pW[2+ps*1]*pT[1+ps*3] + pW[2+ps*2]*pT[2+ps*3] + pW[2+ps*3]*pT[3+ps*3];
+ pW[3+ps*3] = pW[3+ps*0]*pT[0+ps*3] + pW[3+ps*1]*pT[1+ps*3] + pW[3+ps*2]*pT[2+ps*3] + pW[3+ps*3]*pT[3+ps*3];
+ //
+ pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
+ pW[1+ps*2] = pW[1+ps*0]*pT[0+ps*2] + pW[1+ps*1]*pT[1+ps*2] + pW[1+ps*2]*pT[2+ps*2];
+ pW[2+ps*2] = pW[2+ps*0]*pT[0+ps*2] + pW[2+ps*1]*pT[1+ps*2] + pW[2+ps*2]*pT[2+ps*2];
+ pW[3+ps*2] = pW[3+ps*0]*pT[0+ps*2] + pW[3+ps*1]*pT[1+ps*2] + pW[3+ps*2]*pT[2+ps*2];
+ //
+ pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
+ pW[1+ps*1] = pW[1+ps*0]*pT[0+ps*1] + pW[1+ps*1]*pT[1+ps*1];
+ pW[2+ps*1] = pW[2+ps*0]*pT[0+ps*1] + pW[2+ps*1]*pT[1+ps*1];
+ pW[3+ps*1] = pW[3+ps*0]*pT[0+ps*1] + pW[3+ps*1]*pT[1+ps*1];
+ //
+ pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
+ pW[1+ps*0] = pW[1+ps*0]*pT[0+ps*0];
+ pW[2+ps*0] = pW[2+ps*0]*pT[0+ps*0];
+ pW[3+ps*0] = pW[3+ps*0]*pT[0+ps*0];
+ //
+ pD[0+ps*0] += pW[0+ps*0];
+ pD[1+ps*0] += pW[1+ps*0];
+ pD[2+ps*0] += pW[2+ps*0];
+ pD[3+ps*0] += pW[3+ps*0];
+ //
+ pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
+ pD[1+ps*1] += pW[1+ps*0]*pV[0+ps*1] + pW[1+ps*1];
+ pD[2+ps*1] += pW[2+ps*0]*pV[0+ps*1] + pW[2+ps*1];
+ pD[3+ps*1] += pW[3+ps*0]*pV[0+ps*1] + pW[3+ps*1];
+ //
+ pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
+ pD[1+ps*2] += pW[1+ps*0]*pV[0+ps*2] + pW[1+ps*1]*pV[1+ps*2] + pW[1+ps*2];
+ pD[2+ps*2] += pW[2+ps*0]*pV[0+ps*2] + pW[2+ps*1]*pV[1+ps*2] + pW[2+ps*2];
+ pD[3+ps*2] += pW[3+ps*0]*pV[0+ps*2] + pW[3+ps*1]*pV[1+ps*2] + pW[3+ps*2];
+ //
+ pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
+ pD[1+ps*3] += pW[1+ps*0]*pV[0+ps*3] + pW[1+ps*1]*pV[1+ps*3] + pW[1+ps*2]*pV[2+ps*3] + pW[1+ps*3];
+ pD[2+ps*3] += pW[2+ps*0]*pV[0+ps*3] + pW[2+ps*1]*pV[1+ps*3] + pW[2+ps*2]*pV[2+ps*3] + pW[2+ps*3];
+ pD[3+ps*3] += pW[3+ps*0]*pV[0+ps*3] + pW[3+ps*1]*pV[1+ps*3] + pW[3+ps*2]*pV[2+ps*3] + pW[3+ps*3];
+ for(kk=4; kk<kmax; kk++)
+ {
+ pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
+ pD[1+ps*kk] += pW[1+ps*0]*pV[0+ps*kk] + pW[1+ps*1]*pV[1+ps*kk] + pW[1+ps*2]*pV[2+ps*kk] + pW[1+ps*3]*pV[3+ps*kk];
+ pD[2+ps*kk] += pW[2+ps*0]*pV[0+ps*kk] + pW[2+ps*1]*pV[1+ps*kk] + pW[2+ps*2]*pV[2+ps*kk] + pW[2+ps*3]*pV[3+ps*kk];
+ pD[3+ps*kk] += pW[3+ps*0]*pV[0+ps*kk] + pW[3+ps*1]*pV[1+ps*kk] + pW[3+ps*2]*pV[2+ps*kk] + pW[3+ps*3]*pV[3+ps*kk];
+ }
+ return;
+ }
+
+
+
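+// same as kernel_dlarfb4_r_4_lib4, operating on a single row of pD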
+void kernel_dlarfb4_r_1_lib4(int kmax, double *pV, double *pT, double *pD)
+ {
+ const int ps = 4;
+ double pW[16];
+ int kk;
+ // 0
+ pW[0+ps*0] = pD[0+ps*0];
+ // 1
+ pW[0+ps*0] += pD[0+ps*1]*pV[0+ps*1];
+ pW[0+ps*1] = pD[0+ps*1];
+ // 2
+ pW[0+ps*0] += pD[0+ps*2]*pV[0+ps*2];
+ pW[0+ps*1] += pD[0+ps*2]*pV[1+ps*2];
+ pW[0+ps*2] = pD[0+ps*2];
+ // 3
+ pW[0+ps*0] += pD[0+ps*3]*pV[0+ps*3];
+ pW[0+ps*1] += pD[0+ps*3]*pV[1+ps*3];
+ pW[0+ps*2] += pD[0+ps*3]*pV[2+ps*3];
+ pW[0+ps*3] = pD[0+ps*3];
+ //
+ for(kk=4; kk<kmax; kk++)
+ {
+ pW[0+ps*0] += pD[0+ps*kk]*pV[0+ps*kk];
+ pW[0+ps*1] += pD[0+ps*kk]*pV[1+ps*kk];
+ pW[0+ps*2] += pD[0+ps*kk]*pV[2+ps*kk];
+ pW[0+ps*3] += pD[0+ps*kk]*pV[3+ps*kk];
+ }
+ //
+ pW[0+ps*3] = pW[0+ps*0]*pT[0+ps*3] + pW[0+ps*1]*pT[1+ps*3] + pW[0+ps*2]*pT[2+ps*3] + pW[0+ps*3]*pT[3+ps*3];
+ //
+ pW[0+ps*2] = pW[0+ps*0]*pT[0+ps*2] + pW[0+ps*1]*pT[1+ps*2] + pW[0+ps*2]*pT[2+ps*2];
+ //
+ pW[0+ps*1] = pW[0+ps*0]*pT[0+ps*1] + pW[0+ps*1]*pT[1+ps*1];
+ //
+ pW[0+ps*0] = pW[0+ps*0]*pT[0+ps*0];
+ //
+ pD[0+ps*0] += pW[0+ps*0];
+ //
+ pD[0+ps*1] += pW[0+ps*0]*pV[0+ps*1] + pW[0+ps*1];
+ //
+ pD[0+ps*2] += pW[0+ps*0]*pV[0+ps*2] + pW[0+ps*1]*pV[1+ps*2] + pW[0+ps*2];
+ //
+ pD[0+ps*3] += pW[0+ps*0]*pV[0+ps*3] + pW[0+ps*1]*pV[1+ps*3] + pW[0+ps*2]*pV[2+ps*3] + pW[0+ps*3];
+ for(kk=4; kk<kmax; kk++)
+ {
+ pD[0+ps*kk] += pW[0+ps*0]*pV[0+ps*kk] + pW[0+ps*1]*pV[1+ps*kk] + pW[0+ps*2]*pV[2+ps*kk] + pW[0+ps*3]*pV[3+ps*kk];
+ }
+ return;
+ }
diff --git a/kernel/c99/kernel_dgetrf_pivot_4_lib4.c b/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..787322e
--- /dev/null
+++ b/kernel/c99/kernel_dgetrf_pivot_4_lib4.c
@@ -0,0 +1,779 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_d_aux.h"
+
+
+
+// C numbering, starting from 0
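+// return in p_idamax / p_amax the index and the absolute value of the element
+// of largest magnitude in a column of n elements stored panel-major: offset
+// is the row offset inside the first panel, consecutive panels are bs*sda
+// doubles apart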
+void didamax_lib4(int n, int offset, double *pA, int sda, int *p_idamax, double *p_amax)
+ {
+
+ int idamax, ii;
+ double tmp, amax;
+
+ p_idamax[0] = -1;
+ if(n<1)
+ return;
+
+ const int bs = 4;
+
+ int na = (bs - offset%bs)%bs;
+ na = n<na ? n : na;
+
+ amax = -1.0;
+ ii = 0;
+ if(na>0)
+ {
+ for( ; ii<na; ii++)
+ {
+ tmp = fabs(pA[0]);
+ if(tmp>amax)
+ {
+ idamax = ii+0;
+ amax = tmp;
+ }
+ pA += 1;
+ }
+ pA += bs*(sda-1);
+ }
+ for( ; ii<n-3; ii+=4)
+ {
+ tmp = fabs(pA[0]);
+ if(tmp>amax)
+ {
+ idamax = ii+0;
+ amax = tmp;
+ }
+ tmp = fabs(pA[1]);
+ if(tmp>amax)
+ {
+ idamax = ii+1;
+ amax = tmp;
+ }
+ tmp = fabs(pA[2]);
+ if(tmp>amax)
+ {
+ idamax = ii+2;
+ amax = tmp;
+ }
+ tmp = fabs(pA[3]);
+ if(tmp>amax)
+ {
+ idamax = ii+3;
+ amax = tmp;
+ }
+ pA += bs*sda;
+ }
+ for( ; ii<n; ii++)
+ {
+ tmp = fabs(pA[0]);
+ if(tmp>amax)
+ {
+ idamax = ii+0;
+ amax = tmp;
+ }
+ pA += 1;
+ }
+
+ p_amax[0] = amax;
+ p_idamax[0] = idamax;
+
+ return;
+
+ }
+
+
+
+// C numbering (starting from zero) in the ipiv
+// it processes m>=4 rows and 4 cols
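+// LU factorization with row pivoting of a 4-column panel, one column at a
+// time: update the column with the columns already factorized, find the
+// pivot with didamax_lib4, swap the rows, store the reciprocal pivot in
+// inv_diag_A and scale the sub-diagonal entries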
+void kernel_dgetrf_pivot_4_lib4(int m, double *pA, int sda, double *inv_diag_A, int* ipiv)
+ {
+
+ const int bs = 4;
+
+ // assume m>=4
+ int ma = m-4;
+
+ double
+ tmp0, tmp1, tmp2, tmp3,
+ u_00, u_01, u_02, u_03,
+ u_11, u_12, u_13,
+ u_22, u_23,
+ u_33;
+
+ double
+ *pB;
+
+ int
+ k, idamax;
+
+ // first column
+ didamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ tmp0 = 1.0 / pA[0+bs*0];
+ inv_diag_A[0] = tmp0;
+ pA[1+bs*0] *= tmp0;
+ pA[2+bs*0] *= tmp0;
+ pA[3+bs*0] *= tmp0;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB[1+bs*0] *= tmp0;
+ pB[2+bs*0] *= tmp0;
+ pB[3+bs*0] *= tmp0;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[0] = 0.0;
+ }
+
+ // second column
+ u_01 = pA[0+bs*1];
+ tmp1 = pA[1+bs*1];
+ tmp2 = pA[2+bs*1];
+ tmp3 = pA[3+bs*1];
+ tmp1 -= pA[1+bs*0] * u_01;
+ tmp2 -= pA[2+bs*0] * u_01;
+ tmp3 -= pA[3+bs*0] * u_01;
+ pA[1+bs*1] = tmp1;
+ pA[2+bs*1] = tmp2;
+ pA[3+bs*1] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp1 = pB[1+bs*1];
+ tmp2 = pB[2+bs*1];
+ tmp3 = pB[3+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ tmp1 -= pB[1+bs*0] * u_01;
+ tmp2 -= pB[2+bs*0] * u_01;
+ tmp3 -= pB[3+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB[1+bs*1] = tmp1;
+ pB[2+bs*1] = tmp2;
+ pB[3+bs*1] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB += 1;
+ }
+
+ didamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+ ipiv[1] = idamax+1;
+ if(tmp1!=0)
+ {
+ if(ipiv[1]!=1)
+ drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ tmp1 = 1.0 / pA[1+bs*1];
+ inv_diag_A[1] = tmp1;
+ pA[2+bs*1] *= tmp1;
+ pA[3+bs*1] *= tmp1;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB[1+bs*1] *= tmp1;
+ pB[2+bs*1] *= tmp1;
+ pB[3+bs*1] *= tmp1;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[1] = 0.0;
+ }
+
+ // third column
+ u_02 = pA[0+bs*2];
+ u_12 = pA[1+bs*2];
+ u_12 -= pA[1+bs*0] * u_02;
+ pA[1+bs*2] = u_12;
+ tmp2 = pA[2+bs*2];
+ tmp3 = pA[3+bs*2];
+ tmp2 -= pA[2+bs*0] * u_02;
+ tmp3 -= pA[3+bs*0] * u_02;
+ tmp2 -= pA[2+bs*1] * u_12;
+ tmp3 -= pA[3+bs*1] * u_12;
+ pA[2+bs*2] = tmp2;
+ pA[3+bs*2] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp1 = pB[1+bs*2];
+ tmp2 = pB[2+bs*2];
+ tmp3 = pB[3+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp1 -= pB[1+bs*0] * u_02;
+ tmp2 -= pB[2+bs*0] * u_02;
+ tmp3 -= pB[3+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ tmp1 -= pB[1+bs*1] * u_12;
+ tmp2 -= pB[2+bs*1] * u_12;
+ tmp3 -= pB[3+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB[1+bs*2] = tmp1;
+ pB[2+bs*2] = tmp2;
+ pB[3+bs*2] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB += 1;
+ }
+
+ didamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+ ipiv[2] = idamax+2;
+ if(tmp2!=0)
+ {
+ if(ipiv[2]!=2)
+ drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ tmp2 = 1.0 / pA[2+bs*2];
+ inv_diag_A[2] = tmp2;
+ pA[3+bs*2] *= tmp2;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB[1+bs*2] *= tmp2;
+ pB[2+bs*2] *= tmp2;
+ pB[3+bs*2] *= tmp2;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[2] = 0.0;
+ }
+
+ // fourth column
+ u_03 = pA[0+bs*3];
+ u_13 = pA[1+bs*3];
+ u_13 -= pA[1+bs*0] * u_03;
+ pA[1+bs*3] = u_13;
+ u_23 = pA[2+bs*3];
+ u_23 -= pA[2+bs*0] * u_03;
+ u_23 -= pA[2+bs*1] * u_13;
+ pA[2+bs*3] = u_23;
+ tmp3 = pA[3+bs*3];
+ tmp3 -= pA[3+bs*0] * u_03;
+ tmp3 -= pA[3+bs*1] * u_13;
+ tmp3 -= pA[3+bs*2] * u_23;
+ pA[3+bs*3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp1 = pB[1+bs*3];
+ tmp2 = pB[2+bs*3];
+ tmp3 = pB[3+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp1 -= pB[1+bs*0] * u_03;
+ tmp2 -= pB[2+bs*0] * u_03;
+ tmp3 -= pB[3+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp1 -= pB[1+bs*1] * u_13;
+ tmp2 -= pB[2+bs*1] * u_13;
+ tmp3 -= pB[3+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ tmp1 -= pB[1+bs*2] * u_23;
+ tmp2 -= pB[2+bs*2] * u_23;
+ tmp3 -= pB[3+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB[1+bs*3] = tmp1;
+ pB[2+bs*3] = tmp2;
+ pB[3+bs*3] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB += 1;
+ }
+
+ didamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+ ipiv[3] = idamax+3;
+ if(tmp3!=0)
+ {
+ if(ipiv[3]!=3)
+ drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ tmp3 = 1.0 / pA[3+bs*3];
+ inv_diag_A[3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB[1+bs*3] *= tmp3;
+ pB[2+bs*3] *= tmp3;
+ pB[3+bs*3] *= tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[3] = 0.0;
+ }
+
+ return;
+
+ }
+
+
+
+// it processes m>0 rows and 0<n<=4 cols
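+// variable-size version of kernel_dgetrf_pivot_4_lib4: same column-by-column
+// factorization, with early returns after the n-th column and with the m<4
+// tails handled explicitly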
+void kernel_dgetrf_pivot_4_vs_lib4(int m, int n, double *pA, int sda, double *inv_diag_A, int* ipiv)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+ // ma is used only in the m>=4 branches below
+ int ma = m-4;
+
+ double
+ tmp0, tmp1, tmp2, tmp3,
+ u_00, u_01, u_02, u_03,
+ u_11, u_12, u_13,
+ u_22, u_23,
+ u_33;
+
+ double
+ *pB;
+
+ int
+ k, idamax;
+
+ // first column
+
+ // find pivot & scale
+ didamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ drowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ tmp0 = 1.0 / pA[0+bs*0];
+ inv_diag_A[0] = tmp0;
+ if(m>=4)
+ {
+ pA[1+bs*0] *= tmp0;
+ pA[2+bs*0] *= tmp0;
+ pA[3+bs*0] *= tmp0;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB[1+bs*0] *= tmp0;
+ pB[2+bs*0] *= tmp0;
+ pB[3+bs*0] *= tmp0;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {1,2,3}
+ {
+ if(m>1)
+ {
+ pA[1+bs*0] *= tmp0;
+ if(m>2)
+ pA[2+bs*0] *= tmp0;
+ }
+ }
+ }
+ else
+ {
+ inv_diag_A[0] = 0.0;
+ }
+
+ if(n==1 || m==1) // XXX for the first row there is nothing to do, so we can return here
+ return;
+
+ // second column
+
+ // correct
+ if(m>=4)
+ {
+ u_01 = pA[0+bs*1];
+ tmp1 = pA[1+bs*1];
+ tmp2 = pA[2+bs*1];
+ tmp3 = pA[3+bs*1];
+ tmp1 -= pA[1+bs*0] * u_01;
+ tmp2 -= pA[2+bs*0] * u_01;
+ tmp3 -= pA[3+bs*0] * u_01;
+ pA[1+bs*1] = tmp1;
+ pA[2+bs*1] = tmp2;
+ pA[3+bs*1] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp1 = pB[1+bs*1];
+ tmp2 = pB[2+bs*1];
+ tmp3 = pB[3+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ tmp1 -= pB[1+bs*0] * u_01;
+ tmp2 -= pB[2+bs*0] * u_01;
+ tmp3 -= pB[3+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB[1+bs*1] = tmp1;
+ pB[2+bs*1] = tmp2;
+ pB[3+bs*1] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ u_01 = pA[0+bs*1];
+ tmp1 = pA[1+bs*1];
+ tmp1 -= pA[1+bs*0] * u_01;
+ pA[1+bs*1] = tmp1;
+ if(m>2)
+ {
+ tmp2 = pA[2+bs*1];
+ tmp2 -= pA[2+bs*0] * u_01;
+ pA[2+bs*1] = tmp2;
+ }
+ }
+
+ // find pivot & scale
+ didamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+ ipiv[1] = idamax+1;
+ if(tmp1!=0)
+ {
+ if(ipiv[1]!=1)
+ drowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ tmp1 = 1.0 / pA[1+bs*1];
+ inv_diag_A[1] = tmp1;
+ if(m>=4)
+ {
+ pA[2+bs*1] *= tmp1;
+ pA[3+bs*1] *= tmp1;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB[1+bs*1] *= tmp1;
+ pB[2+bs*1] *= tmp1;
+ pB[3+bs*1] *= tmp1;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ if(m>2)
+ pA[2+bs*1] *= tmp1;
+ }
+ }
+ else
+ {
+ inv_diag_A[1] = 0.0;
+ }
+
+ if(n==2)
+ return;
+
+ // third column
+
+ // correct
+ if(m>=4)
+ {
+ u_02 = pA[0+bs*2];
+ u_12 = pA[1+bs*2];
+ u_12 -= pA[1+bs*0] * u_02;
+ pA[1+bs*2] = u_12;
+ tmp2 = pA[2+bs*2];
+ tmp3 = pA[3+bs*2];
+ tmp2 -= pA[2+bs*0] * u_02;
+ tmp3 -= pA[3+bs*0] * u_02;
+ tmp2 -= pA[2+bs*1] * u_12;
+ tmp3 -= pA[3+bs*1] * u_12;
+ pA[2+bs*2] = tmp2;
+ pA[3+bs*2] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp1 = pB[1+bs*2];
+ tmp2 = pB[2+bs*2];
+ tmp3 = pB[3+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp1 -= pB[1+bs*0] * u_02;
+ tmp2 -= pB[2+bs*0] * u_02;
+ tmp3 -= pB[3+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ tmp1 -= pB[1+bs*1] * u_12;
+ tmp2 -= pB[2+bs*1] * u_12;
+ tmp3 -= pB[3+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB[1+bs*2] = tmp1;
+ pB[2+bs*2] = tmp2;
+ pB[3+bs*2] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ u_02 = pA[0+bs*2];
+ u_12 = pA[1+bs*2];
+ u_12 -= pA[1+bs*0] * u_02;
+ pA[1+bs*2] = u_12;
+ if(m>2)
+ {
+ tmp2 = pA[2+bs*2];
+ tmp2 -= pA[2+bs*0] * u_02;
+ tmp2 -= pA[2+bs*1] * u_12;
+ pA[2+bs*2] = tmp2;
+ }
+ }
+
+ // find pivot & scale
+ if(m>2)
+ {
+ didamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+ ipiv[2] = idamax+2;
+ if(tmp2!=0)
+ {
+ if(ipiv[2]!=2)
+ drowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ tmp2 = 1.0 / pA[2+bs*2];
+ inv_diag_A[2] = tmp2;
+ if(m>=4)
+ {
+ pA[3+bs*2] *= tmp2;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB[1+bs*2] *= tmp2;
+ pB[2+bs*2] *= tmp2;
+ pB[3+bs*2] *= tmp2;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB += 1;
+ }
+ }
+ }
+ else
+ {
+ inv_diag_A[2] = 0.0;
+ }
+ }
+
+ if(n<4)
+ return;
+
+ // fourth column
+
+ // correct
+ if(m>=4)
+ {
+ u_03 = pA[0+bs*3];
+ u_13 = pA[1+bs*3];
+ u_13 -= pA[1+bs*0] * u_03;
+ pA[1+bs*3] = u_13;
+ u_23 = pA[2+bs*3];
+ u_23 -= pA[2+bs*0] * u_03;
+ u_23 -= pA[2+bs*1] * u_13;
+ pA[2+bs*3] = u_23;
+ tmp3 = pA[3+bs*3];
+ tmp3 -= pA[3+bs*0] * u_03;
+ tmp3 -= pA[3+bs*1] * u_13;
+ tmp3 -= pA[3+bs*2] * u_23;
+ pA[3+bs*3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp1 = pB[1+bs*3];
+ tmp2 = pB[2+bs*3];
+ tmp3 = pB[3+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp1 -= pB[1+bs*0] * u_03;
+ tmp2 -= pB[2+bs*0] * u_03;
+ tmp3 -= pB[3+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp1 -= pB[1+bs*1] * u_13;
+ tmp2 -= pB[2+bs*1] * u_13;
+ tmp3 -= pB[3+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ tmp1 -= pB[1+bs*2] * u_23;
+ tmp2 -= pB[2+bs*2] * u_23;
+ tmp3 -= pB[3+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB[1+bs*3] = tmp1;
+ pB[2+bs*3] = tmp2;
+ pB[3+bs*3] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ u_03 = pA[0+bs*3];
+ u_13 = pA[1+bs*3];
+ u_13 -= pA[1+bs*0] * u_03;
+ pA[1+bs*3] = u_13;
+ if(m>2)
+ {
+ u_23 = pA[2+bs*3];
+ u_23 -= pA[2+bs*0] * u_03;
+ u_23 -= pA[2+bs*1] * u_13;
+ pA[2+bs*3] = u_23;
+ }
+ }
+
+ if(m>3)
+ {
+ // find pivot & scale
+ didamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+ ipiv[3] = idamax+3;
+ if(tmp3!=0)
+ {
+ if(ipiv[3]!=3)
+ drowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ tmp3 = 1.0 / pA[3+bs*3];
+ inv_diag_A[3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB[1+bs*3] *= tmp3;
+ pB[2+bs*3] *= tmp3;
+ pB[3+bs*3] *= tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[3] = 0.0;
+ }
+ }
+
+ return;
+
+ }
+
+
+
+
+
diff --git a/kernel/c99/kernel_dsymv_4_lib4.c b/kernel/c99/kernel_dsymv_4_lib4.c
new file mode 100644
index 0000000..bed4300
--- /dev/null
+++ b/kernel/c99/kernel_dsymv_4_lib4.c
@@ -0,0 +1,1024 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
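+// fused gemv_n / gemv_t over a 4-column block of A (panel-major, panel
+// stride sda): z_n[i] += alpha_n * A[i,:]*x_n is accumulated in place (hence
+// the note above: z_n must already contain the scaled y_n), while
+// z_t = alpha_t * A^T*x_t + beta_t*y_t is stored at the end; km is the
+// number of active columns (and of entries written to z_t)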
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+void kernel_dgemv_nt_4_vs_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ a_00, a_01, a_02, a_03,
+ x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+ x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+
+ x_n_0 = 0;
+ x_n_1 = 0;
+ x_n_2 = 0;
+ x_n_3 = 0;
+
+ x_n_0 = alpha_n[0]*x_n[0];
+ if(km>1)
+ {
+ x_n_1 = alpha_n[0]*x_n[1];
+ if(km>2)
+ {
+ x_n_2 = alpha_n[0]*x_n[2];
+ if(km>3)
+ {
+ x_n_3 = alpha_n[0]*x_n[3];
+ }
+ }
+ }
+
+ y_t_0 = 0;
+ y_t_1 = 0;
+ y_t_2 = 0;
+ y_t_3 = 0;
+
+ k = 0;
+ for(; k<kmax-3; k+=bs)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+ a_02 = A[1+bs*2];
+ a_03 = A[1+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+ a_03 = A[2+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ }
+
+ // store t
+ z_t[0] = alpha_t[0]*y_t_0 + beta_t[0]*y_t[0];
+ if(km>1)
+ {
+ z_t[1] = alpha_t[0]*y_t_1 + beta_t[0]*y_t[1];
+ if(km>2)
+ {
+ z_t[2] = alpha_t[0]*y_t_2 + beta_t[0]*y_t[2];
+ if(km>3)
+ {
+ z_t[3] = alpha_t[0]*y_t_3 + beta_t[0]*y_t[3];
+ }
+ }
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+void kernel_dgemv_nt_4_lib4(int kmax, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t)
+ {
+
+ kernel_dgemv_nt_4_vs_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t, 4);
+
+ return;
+
+ }
+#endif
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
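+// symmetric (lower stored) matrix-vector product restricted to a 4-column
+// block: x_t and z_t alias x_n and z_n, the diagonal 4 x 4 block is handled
+// according to its row offset offA inside the panel reading only the lower
+// triangle, then full panels follow as in the gemv_nt kernel; km is the
+// number of active columns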
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+void kernel_dsymv_l_4_gen_lib4(int kmax, double *alpha, int offA, double *A, int sda, double *x_n, double *z_n, int km)
+ {
+
+ if(kmax<=0)
+ return;
+
+ double *x_t = x_n;
+ double *z_t = z_n;
+
+ const int bs = 4;
+
+ int k;
+
+ double
+ a_00, a_01, a_02, a_03,
+ x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+ x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+
+ x_n_0 = 0;
+ x_n_1 = 0;
+ x_n_2 = 0;
+ x_n_3 = 0;
+
+ x_n_0 = alpha[0]*x_n[0];
+ if(km>1)
+ {
+ x_n_1 = alpha[0]*x_n[1];
+ if(km>2)
+ {
+ x_n_2 = alpha[0]*x_n[2];
+ if(km>3)
+ {
+ x_n_3 = alpha[0]*x_n[3];
+ }
+ }
+ }
+
+ y_t_0 = 0;
+ y_t_1 = 0;
+ y_t_2 = 0;
+ y_t_3 = 0;
+
+ k = 0;
+ if(offA==0)
+ {
+ if(kmax<4)
+ {
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+ goto store_t;
+ }
+ else
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+ k += 4;
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ }
+ else if(offA==1)
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==5)
+ goto store_t;
+
+ // 5
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==6)
+ goto store_t;
+
+ // 6
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==7)
+ goto store_t;
+
+ k += 7;
+
+ }
+ else if(offA==2)
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==5)
+ goto store_t;
+
+ // 5
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==6)
+ goto store_t;
+
+ k += 6;
+
+ }
+ else // if(offA==3)
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==5)
+ goto store_t;
+
+ k += 5;
+
+ }
+ for(; k<kmax-3; k+=bs)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+ a_02 = A[1+bs*2];
+ a_03 = A[1+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+ a_03 = A[2+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ }
+
+ store_t:
+ z_t[0] += alpha[0]*y_t_0;
+ if(km>1)
+ {
+ z_t[1] += alpha[0]*y_t_1;
+ if(km>2)
+ {
+ z_t[2] += alpha[0]*y_t_2;
+ if(km>3)
+ {
+ z_t[3] += alpha[0]*y_t_3;
+ }
+ }
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if ! ( defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) )
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+void kernel_dsymv_l_4_lib4(int kmax, double *alpha, double *A, int sda, double *x_n, double *z_n)
+ {
+
+ kernel_dsymv_l_4_gen_lib4(kmax, alpha, 0, A, sda, x_n, z_n, 4);
+
+ return;
+
+ }
+#endif
+
+
+
+
diff --git a/kernel/c99/kernel_sgecp_lib4.c b/kernel/c99/kernel_sgecp_lib4.c
new file mode 100644
index 0000000..de5b704
--- /dev/null
+++ b/kernel/c99/kernel_sgecp_lib4.c
@@ -0,0 +1,1148 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+
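+// single precision scale (sgesc), copy (sgecp) and lower triangular copy
+// (strcp_l) kernels, operating column by column on the 4-row panel format;
+// in the copy kernels the first suffix is the number of active rows and the
+// second the row offset of A with respect to B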
+// both A and B are aligned to 256-bit boundaries
+void kernel_sgesc_4_lib4(int kmax, float *alphap, float *A)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ A[0+bs*0] *= alpha;
+ A[1+bs*0] *= alpha;
+ A[2+bs*0] *= alpha;
+ A[3+bs*0] *= alpha;
+
+ A += 4;
+
+ }
+
+ }
+
+
+
+void kernel_sgesc_3_lib4(int kmax, float *alphap, float *A)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ A[0+bs*0] *= alpha;
+ A[1+bs*0] *= alpha;
+ A[2+bs*0] *= alpha;
+
+ A += 4;
+
+ }
+
+ }
+
+
+
+void kernel_sgesc_2_lib4(int kmax, float *alphap, float *A)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ A[0+bs*0] *= alpha;
+ A[1+bs*0] *= alpha;
+
+ A += 4;
+
+ }
+
+ }
+
+
+
+void kernel_sgesc_1_lib4(int kmax, float *alphap, float *A)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ A[0+bs*0] *= alpha;
+
+ A += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries
+void kernel_sgecp_4_0_lib4(int kmax, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+ B[1+bs*0] = A[1+bs*0];
+ B[2+bs*0] = A[2+bs*0];
+ B[3+bs*0] = A[3+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_sgecp_4_1_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[1+bs*0];
+ B[1+bs*0] = A0[2+bs*0];
+ B[2+bs*0] = A0[3+bs*0];
+ B[3+bs*0] = A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_sgecp_4_2_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[2+bs*0];
+ B[1+bs*0] = A0[3+bs*0];
+ B[2+bs*0] = A1[0+bs*0];
+ B[3+bs*0] = A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_sgecp_4_3_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[3+bs*0];
+ B[1+bs*0] = A1[0+bs*0];
+ B[2+bs*0] = A1[1+bs*0];
+ B[3+bs*0] = A1[2+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgecp_3_0_lib4(int kmax, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+ B[1+bs*0] = A[1+bs*0];
+ B[2+bs*0] = A[2+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_sgecp_3_2_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[2+bs*0];
+ B[1+bs*0] = A0[3+bs*0];
+ B[2+bs*0] = A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_sgecp_3_3_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[3+bs*0];
+ B[1+bs*0] = A1[0+bs*0];
+ B[2+bs*0] = A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgecp_2_0_lib4(int kmax, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+ B[1+bs*0] = A[1+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_sgecp_2_3_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[3+bs*0];
+ B[1+bs*0] = A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgecp_1_0_lib4(int kmax, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
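+// the strcp_l kernels copy a lower triangular panel: kmax+1 full-height
+// columns followed by the trailing triangle (3 x 3 for the 4-row kernels,
+// 2 x 2 for the 3-row ones)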
+// both A and B are aligned to 256-bit boundaries
+void kernel_strcp_l_4_0_lib4(int kmax, float *A, float *B)
+ {
+
+ // A and B are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+ B[1+bs*0] = A[1+bs*0];
+ B[2+bs*0] = A[2+bs*0];
+ B[3+bs*0] = A[3+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ // 3x3 triangle
+
+ B[1+bs*0] = A[1+bs*0];
+ B[2+bs*0] = A[2+bs*0];
+ B[3+bs*0] = A[3+bs*0];
+
+ B[2+bs*1] = A[2+bs*1];
+ B[3+bs*1] = A[3+bs*1];
+
+ B[3+bs*2] = A[3+bs*2];
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_strcp_l_4_1_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ // A and B are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[1+bs*0];
+ B[1+bs*0] = A0[2+bs*0];
+ B[2+bs*0] = A0[3+bs*0];
+ B[3+bs*0] = A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ // 3x3 triangle
+
+ B[1+0*bs] = A0[2+0*bs];
+ B[2+0*bs] = A0[3+0*bs];
+ B[3+0*bs] = A1[0+0*bs];
+
+ B[2+1*bs] = A0[3+1*bs];
+ B[3+1*bs] = A1[0+1*bs];
+
+ B[3+2*bs] = A1[0+2*bs];
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_strcp_l_4_2_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ // A and B are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[2+bs*0];
+ B[1+bs*0] = A0[3+bs*0];
+ B[2+bs*0] = A1[0+bs*0];
+ B[3+bs*0] = A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ // 3x3 triangle
+
+ B[1+bs*0] = A0[3+bs*0];
+ B[2+bs*0] = A1[0+bs*0];
+ B[3+bs*0] = A1[1+bs*0];
+
+ B[2+bs*1] = A1[0+bs*1];
+ B[3+bs*1] = A1[1+bs*1];
+
+ B[3+bs*2] = A1[1+bs*2];
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_strcp_l_4_3_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ // A and B are lower triangular
+ // kmax+1 4-wide + end 3x3 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[3+bs*0];
+ B[1+bs*0] = A1[0+bs*0];
+ B[2+bs*0] = A1[1+bs*0];
+ B[3+bs*0] = A1[2+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ // 3x3 triangle
+
+ B[1+bs*0] = A1[0+bs*0];
+ B[2+bs*0] = A1[1+bs*0];
+ B[3+bs*0] = A1[2+bs*0];
+
+ B[2+bs*1] = A1[1+bs*1];
+ B[3+bs*1] = A1[2+bs*1];
+
+ B[3+bs*2] = A1[2+bs*2];
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_strcp_l_3_0_lib4(int kmax, float *A, float *B)
+ {
+
+ // A and B are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+ B[1+bs*0] = A[1+bs*0];
+ B[2+bs*0] = A[2+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ // 2x2 triangle
+
+ B[1+bs*0] = A[1+bs*0];
+ B[2+bs*0] = A[2+bs*0];
+
+ B[2+bs*1] = A[2+bs*1];
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_strcp_l_3_2_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ // A and B are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[2+bs*0];
+ B[1+bs*0] = A0[3+bs*0];
+ B[2+bs*0] = A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ // 2x2 triangle
+
+ B[1+bs*0] = A0[3+bs*0];
+ B[2+bs*0] = A1[0+bs*0];
+
+ B[2+bs*1] = A1[0+bs*1];
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_strcp_l_3_3_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ // A and B are lower triangular
+ // kmax+1 3-wide + end 2x2 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[3+bs*0];
+ B[1+bs*0] = A1[0+bs*0];
+ B[2+bs*0] = A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ // 2x2 triangle
+
+ B[1+bs*0] = A1[0+bs*0];
+ B[2+bs*0] = A1[1+bs*0];
+
+ B[2+bs*1] = A1[1+bs*1];
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_strcp_l_2_0_lib4(int kmax, float *A, float *B)
+ {
+
+ // A and B are lower triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+ B[1+bs*0] = A[1+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ // 1x1 triangle
+
+ B[1+bs*0] = A[1+bs*0];
+
+ }
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_strcp_l_2_3_lib4(int kmax, float *A0, int sda, float *B)
+ {
+
+ // A and B are lower triangular
+ // kmax+1 2-wide + end 1x1 triangle
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A0[3+bs*0];
+ B[1+bs*0] = A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ // 1x1 triangle
+
+ B[1+bs*0] = A1[0+bs*0];
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_strcp_l_1_0_lib4(int kmax, float *A, float *B)
+ {
+
+ // A and B are lower triangular
+ // kmax+1 1-wide
+
+ kmax += 1;
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] = A[0+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+
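+// the kernel_sgead_* kernels below compute B += alpha*A column by column on a
+// panel of up to 4 rows, with alpha passed by pointer in alphap.
+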
+// both A and B are aligned to 256-bit boundaries
+void kernel_sgead_4_0_lib4(int kmax, float *alphap, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+ B[2+bs*0] += alpha * A[2+bs*0];
+ B[3+bs*0] += alpha * A[3+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
+void kernel_sgead_4_1_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[1+bs*0];
+ B[1+bs*0] += alpha * A0[2+bs*0];
+ B[2+bs*0] += alpha * A0[3+bs*0];
+ B[3+bs*0] += alpha * A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_sgead_4_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[2+bs*0];
+ B[1+bs*0] += alpha * A0[3+bs*0];
+ B[2+bs*0] += alpha * A1[0+bs*0];
+ B[3+bs*0] += alpha * A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_sgead_4_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+ B[2+bs*0] += alpha * A1[1+bs*0];
+ B[3+bs*0] += alpha * A1[2+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgead_3_0_lib4(int kmax, float *alphap, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+ B[2+bs*0] += alpha * A[2+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
+void kernel_sgead_3_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[2+bs*0];
+ B[1+bs*0] += alpha * A0[3+bs*0];
+ B[2+bs*0] += alpha * A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
+void kernel_sgead_3_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+ B[2+bs*0] += alpha * A1[1+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgead_2_0_lib4(int kmax, float *alphap, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+ B[1+bs*0] += alpha * A[1+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
+void kernel_sgead_2_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ float *A1 = A0 + bs*sda;
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A0[3+bs*0];
+ B[1+bs*0] += alpha * A1[0+bs*0];
+
+ A0 += 4;
+ A1 += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+// both A and B are aligned to 64-bit boundaries
+void kernel_sgead_1_0_lib4(int kmax, float *alphap, float *A, float *B)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ float alpha = alphap[0];
+
+ int k;
+
+ for(k=0; k<kmax; k++)
+ {
+
+ B[0+bs*0] += alpha * A[0+bs*0];
+
+ A += 4;
+ B += 4;
+
+ }
+
+ }
+
+
+
+
+
diff --git a/kernel/c99/kernel_sgemm_4x4_lib4.c b/kernel/c99/kernel_sgemm_4x4_lib4.c
new file mode 100644
index 0000000..243d559
--- /dev/null
+++ b/kernel/c99/kernel_sgemm_4x4_lib4.c
@@ -0,0 +1,6094 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+
+
+
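+// generic C99 implementations of the single precision 4x4 kernels; each kernel
+// is compiled only for the targets listed in its preprocessor guard.
+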
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_nt_4x4_gen_lib4(int kmax, float *alpha, float *A, float *B, float *beta, int offsetC, float *C0, int sdc, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ float
+ *C1, *D1;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ if(offsetC==0)
+ {
+ c_00 = beta[0]*C0[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C0[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C0[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C0[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C0[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C0[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C0[3+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==1)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[1+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[2+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C0[3+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[0+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[1+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[2+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C0[3+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[0+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[1+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[2+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C0[3+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[0+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[1+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[2+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C0[3+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[0+bs*3] + alpha[0]*c_33;
+ }
+ else if(offsetC==2)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[2+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C0[3+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[0+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[1+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[2+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C0[3+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[0+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[1+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[2+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C0[3+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C1[0+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[1+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[2+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C0[3+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C1[0+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[1+bs*3] + alpha[0]*c_33;
+ }
+ else //if(offsetC==3)
+ {
+ C1 = C0 + sdc*bs;
+
+ c_00 = beta[0]*C0[3+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C1[0+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C1[1+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C1[2+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C0[3+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C1[0+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C1[1+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C1[2+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C0[3+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C1[0+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C1[1+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C1[2+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C0[3+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C1[0+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C1[1+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C1[2+bs*3] + alpha[0]*c_33;
+ }
+
+ // shift the computed columns left by n0 and advance D accordingly
+ if(n0>0)
+ {
+ if(n0==1)
+ {
+ c_00 = c_01;
+ c_10 = c_11;
+ c_20 = c_21;
+ c_30 = c_31;
+
+ c_01 = c_02;
+ c_11 = c_12;
+ c_21 = c_22;
+ c_31 = c_32;
+
+ c_02 = c_03;
+ c_12 = c_13;
+ c_22 = c_23;
+ c_32 = c_33;
+
+ D0 += 1*bs;
+ }
+ else if(n0==2)
+ {
+ c_00 = c_02;
+ c_10 = c_12;
+ c_20 = c_22;
+ c_30 = c_32;
+
+ c_01 = c_03;
+ c_11 = c_13;
+ c_21 = c_23;
+ c_31 = c_33;
+
+ D0 += 2*bs;
+ }
+ else //if(n0==3)
+ {
+ c_00 = c_03;
+ c_10 = c_13;
+ c_20 = c_23;
+ c_30 = c_33;
+
+ D0 += 3*bs;
+ }
+ }
+
+ int kn = n1 - n0;
+
+ if(offsetD==0)
+ {
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+ if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+ if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+ if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+ if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+ }
+ else if(offsetD==1)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+ }
+ else if(offsetD==2)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+ }
+ else //if(offsetD==3)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+ if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+ if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+ if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+ if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+ }
+
+ return;
+
+ }
+#endif
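+
+// kernel_sgemm_nt_4x4_gen_lib4 above computes a 4x4 tile of
+// D = beta*C + alpha*A*B^T, where offsetC/offsetD are the row offsets of C and
+// D inside their panels and m0,m1,n0,n1 select which rows and columns of the
+// tile are actually stored. an illustrative full-tile call (not taken from
+// this file) would be:
+//   kernel_sgemm_nt_4x4_gen_lib4(k, &alpha, A, B, &beta, 0, C, sdc, 0, D, sdd, 0, 4, 0, 4);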
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_nt_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
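+
+// the _vs ("variable size") variant above clips its stores to km rows and kn
+// columns, so it also handles the edge tiles of matrices whose sizes are not
+// multiples of 4; the fixed-size kernel below simply calls it with km=4, kn=4.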
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER)
+void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+ kernel_sgemm_nt_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_nn_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[1];
+ b_1 = B[5];
+ b_2 = B[9];
+ b_3 = B[13];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[2];
+ b_1 = B[6];
+ b_2 = B[10];
+ b_3 = B[14];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[3];
+ b_1 = B[7];
+ b_2 = B[11];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+
+ }
+
+ c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
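+
+// in the nn variant above B is not transposed: within a panel the 4 entries of
+// a row of B are read with stride 4 (B[0], B[4], B[8], B[12]), and B advances
+// by 4*sdb after each unrolled group of 4 iterations and by 1 in the cleanup
+// loop.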
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, float *B, int sdb, float *beta, float *C, float *D)
+ {
+ kernel_sgemm_nn_4x4_vs_lib4(kmax, alpha, A, B, sdb, beta, C, D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_ssyrk_nt_l_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, //c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, //c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, //c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+// c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+// c_02 += a_0 * b_2;
+// c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+// c_03 += a_0 * b_3;
+// c_13 += a_1 * b_3;
+// c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+// c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+// c_02 += a_0 * b_2;
+// c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+// c_03 += a_0 * b_3;
+// c_13 += a_1 * b_3;
+// c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+// c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+// c_02 += a_0 * b_2;
+// c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+// c_03 += a_0 * b_3;
+// c_13 += a_1 * b_3;
+// c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+// c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+// c_02 += a_0 * b_2;
+// c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+// c_03 += a_0 * b_3;
+// c_13 += a_1 * b_3;
+// c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+// c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+// c_02 += a_0 * b_2;
+// c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+// c_03 += a_0 * b_3;
+// c_13 += a_1 * b_3;
+// c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+// c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+// c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+// c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+// c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+// c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+// c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+// if(kn==1)
+// return;
+
+// D[0+bs*1] = c_01;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
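+
+// the ssyrk_nt_l kernel above accumulates and stores only the lower triangle
+// of the 4x4 tile (the commented-out c_01, c_02, ... terms are the strictly
+// upper part), which is all that a symmetric rank-k update needs.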
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_ssyrk_nt_l_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+ kernel_ssyrk_nt_l_4x4_vs_lib4(kmax, alpha, A, B, beta, C, D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmm_nt_ru_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ k = 0;
+
+ // k = 0
+ if(kmax>0)
+ {
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 4;
+ k++;
+ }
+
+ // k = 1
+ if(kmax>1)
+ {
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 4;
+ k++;
+ }
+
+ // k = 2
+ if(kmax>2)
+ {
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 4;
+ k++;
+ }
+
+ for(; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = beta[0]*C[0+bs*0] + alpha[0]*c_00;
+ c_10 = beta[0]*C[1+bs*0] + alpha[0]*c_10;
+ c_20 = beta[0]*C[2+bs*0] + alpha[0]*c_20;
+ c_30 = beta[0]*C[3+bs*0] + alpha[0]*c_30;
+
+ c_01 = beta[0]*C[0+bs*1] + alpha[0]*c_01;
+ c_11 = beta[0]*C[1+bs*1] + alpha[0]*c_11;
+ c_21 = beta[0]*C[2+bs*1] + alpha[0]*c_21;
+ c_31 = beta[0]*C[3+bs*1] + alpha[0]*c_31;
+
+ c_02 = beta[0]*C[0+bs*2] + alpha[0]*c_02;
+ c_12 = beta[0]*C[1+bs*2] + alpha[0]*c_12;
+ c_22 = beta[0]*C[2+bs*2] + alpha[0]*c_22;
+ c_32 = beta[0]*C[3+bs*2] + alpha[0]*c_32;
+
+ c_03 = beta[0]*C[0+bs*3] + alpha[0]*c_03;
+ c_13 = beta[0]*C[1+bs*3] + alpha[0]*c_13;
+ c_23 = beta[0]*C[2+bs*3] + alpha[0]*c_23;
+ c_33 = beta[0]*C[3+bs*3] + alpha[0]*c_33;
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
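+
+// in strmm_nt_ru above B is upper triangular, so the first three iterations
+// are peeled: k=0 reads only b_0, k=1 reads b_0 and b_1, k=2 reads b_0..b_2;
+// from k=3 on every column of B is full and the dense 4x4 update is used.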
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmm_nt_ru_4x4_lib4(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+ kernel_strmm_nt_ru_4x4_vs_lib4(k, alpha, A, B, beta, C, D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmm_nn_rl_4x4_gen_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ float *D1;
+
+ int k;
+
+ B += offsetB;
+
+ k = 0;
+
+ if(offsetB==0)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 3
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+ else if(offsetB==1)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+ else if(offsetB==2)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 3
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 4
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 5
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+ else // if(offsetB==3)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 1
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 2
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 3
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+ k += 1;
+
+ if(k>=kmax)
+ goto store;
+
+ // k = 4
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ b_1 = B[4];
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ b_2 = B[8];
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ b_3 = B[12];
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 4*sdb-3;
+ k += 1;
+
+ }
+
+ for(; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[1];
+ b_1 = B[5];
+ b_2 = B[9];
+ b_3 = B[13];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[2];
+ b_1 = B[6];
+ b_2 = B[10];
+ b_3 = B[14];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[3];
+ b_1 = B[7];
+ b_2 = B[11];
+ b_3 = B[15];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[4];
+ b_2 = B[8];
+ b_3 = B[12];
+
+ c_00 += a_0 * b_0;
+ c_10 += a_1 * b_0;
+ c_20 += a_2 * b_0;
+ c_30 += a_3 * b_0;
+
+ c_01 += a_0 * b_1;
+ c_11 += a_1 * b_1;
+ c_21 += a_2 * b_1;
+ c_31 += a_3 * b_1;
+
+ c_02 += a_0 * b_2;
+ c_12 += a_1 * b_2;
+ c_22 += a_2 * b_2;
+ c_32 += a_3 * b_2;
+
+ c_03 += a_0 * b_3;
+ c_13 += a_1 * b_3;
+ c_23 += a_2 * b_3;
+ c_33 += a_3 * b_3;
+
+ A += 4;
+ B += 1;
+
+ }
+
+ store:
+
+ c_00 = alpha[0]*c_00;
+ c_10 = alpha[0]*c_10;
+ c_20 = alpha[0]*c_20;
+ c_30 = alpha[0]*c_30;
+
+ c_01 = alpha[0]*c_01;
+ c_11 = alpha[0]*c_11;
+ c_21 = alpha[0]*c_21;
+ c_31 = alpha[0]*c_31;
+
+ c_02 = alpha[0]*c_02;
+ c_12 = alpha[0]*c_12;
+ c_22 = alpha[0]*c_22;
+ c_32 = alpha[0]*c_32;
+
+ c_03 = alpha[0]*c_03;
+ c_13 = alpha[0]*c_13;
+ c_23 = alpha[0]*c_23;
+ c_33 = alpha[0]*c_33;
+
+ // shift the computed columns left by n0 and advance D accordingly
+ if(n0>0)
+ {
+ if(n0==1)
+ {
+ c_00 = c_01;
+ c_10 = c_11;
+ c_20 = c_21;
+ c_30 = c_31;
+
+ c_01 = c_02;
+ c_11 = c_12;
+ c_21 = c_22;
+ c_31 = c_32;
+
+ c_02 = c_03;
+ c_12 = c_13;
+ c_22 = c_23;
+ c_32 = c_33;
+
+ D0 += 1*bs;
+ }
+ else if(n0==2)
+ {
+ c_00 = c_02;
+ c_10 = c_12;
+ c_20 = c_22;
+ c_30 = c_32;
+
+ c_01 = c_03;
+ c_11 = c_13;
+ c_21 = c_23;
+ c_31 = c_33;
+
+ D0 += 2*bs;
+ }
+ else //if(n0==3)
+ {
+ c_00 = c_03;
+ c_10 = c_13;
+ c_20 = c_23;
+ c_30 = c_33;
+
+ D0 += 3*bs;
+ }
+ }
+
+ int kn = n1 - n0;
+
+ if(offsetD==0)
+ {
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[1+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[2+bs*0] = c_20;
+ if(m0<=3 & m1>3) D0[3+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[1+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[2+bs*1] = c_21;
+ if(m0<=3 & m1>3) D0[3+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[1+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[2+bs*2] = c_22;
+ if(m0<=3 & m1>3) D0[3+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[0+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[1+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[2+bs*3] = c_23;
+ if(m0<=3 & m1>3) D0[3+bs*3] = c_33;
+ }
+ else if(offsetD==1)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[2+bs*0] = c_10;
+ if(m0<=2 & m1>2) D0[3+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[0+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[2+bs*1] = c_11;
+ if(m0<=2 & m1>2) D0[3+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[0+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[2+bs*2] = c_12;
+ if(m0<=2 & m1>2) D0[3+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[0+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[1+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[2+bs*3] = c_13;
+ if(m0<=2 & m1>2) D0[3+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[0+bs*3] = c_33;
+ }
+ else if(offsetD==2)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*0] = c_00;
+ if(m0<=1 & m1>1) D0[3+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[0+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[1+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*1] = c_01;
+ if(m0<=1 & m1>1) D0[3+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[0+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[1+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*2] = c_02;
+ if(m0<=1 & m1>1) D0[3+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[0+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[1+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[2+bs*3] = c_03;
+ if(m0<=1 & m1>1) D0[3+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[0+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[1+bs*3] = c_33;
+ }
+ else //if(offsetD==3)
+ {
+ D1 = D0 + sdd*bs;
+
+ if(kn<=0)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*0] = c_00;
+ if(m0<=1 & m1>1) D1[0+bs*0] = c_10;
+ if(m0<=2 & m1>2) D1[1+bs*0] = c_20;
+ if(m0<=3 & m1>3) D1[2+bs*0] = c_30;
+
+ if(kn<=1)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*1] = c_01;
+ if(m0<=1 & m1>1) D1[0+bs*1] = c_11;
+ if(m0<=2 & m1>2) D1[1+bs*1] = c_21;
+ if(m0<=3 & m1>3) D1[2+bs*1] = c_31;
+
+ if(kn<=2)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*2] = c_02;
+ if(m0<=1 & m1>1) D1[0+bs*2] = c_12;
+ if(m0<=2 & m1>2) D1[1+bs*2] = c_22;
+ if(m0<=3 & m1>3) D1[2+bs*2] = c_32;
+
+ if(kn<=3)
+ return;
+
+ if(m0<=0 & m1>0) D0[3+bs*3] = c_03;
+ if(m0<=1 & m1>1) D1[0+bs*3] = c_13;
+ if(m0<=2 & m1>2) D1[1+bs*3] = c_23;
+ if(m0<=3 & m1>3) D1[2+bs*3] = c_33;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
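+// Full-block wrapper around the generalized strmm kernel above: zero store offset and
+// all 4 rows and 4 columns of the block D written.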
+void kernel_strmm_nn_rl_4x4_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, float *D)
+ {
+ kernel_strmm_nn_rl_4x4_gen_lib4(kmax, alpha, A, offsetB, B, sdb, 0, D, 0, 0, 4, 0, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
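+// Lower Cholesky kernel: accumulates C - A * B^T over kmax terms, factorizes the lower
+// 4x4 triangle in registers and stores it in D; the reciprocals of the diagonal entries
+// are written to inv_diag_D (set to 0.0 when a pivot is not positive). km and kn clip
+// the rows and columns actually stored ("vs" = variable size).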
+void kernel_spotrf_nt_l_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, //c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, //c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, //c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+// c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+// c_02 -= a_0 * b_2;
+// c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+// c_03 -= a_0 * b_3;
+// c_13 -= a_1 * b_3;
+// c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+// c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+// c_02 = C[0+bs*2] + c_02;
+// c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+// c_03 = C[0+bs*3] + c_03;
+// c_13 = C[1+bs*3] + c_13;
+// c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+ if(c_00>0)
+ {
+ c_00 = sqrt(c_00);
+ tmp = 1.0/c_00;
+ }
+ else
+ {
+ c_00 = 0.0;
+ tmp = 0.0;
+ }
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+ inv_diag_D[0] = tmp;
+
+ if(kn==1)
+ goto store;
+
+ c_11 -= c_10 * c_10;
+ c_21 -= c_20 * c_10;
+ c_31 -= c_30 * c_10;
+ if(c_11>0)
+ {
+ c_11 = sqrt(c_11);
+ tmp = 1.0/c_11;
+ }
+ else
+ {
+ c_11 = 0.0;
+ tmp = 0.0;
+ }
+ c_21 *= tmp;
+ c_31 *= tmp;
+ inv_diag_D[1] = tmp;
+
+ if(kn==2)
+ goto store;
+
+ c_22 -= c_20 * c_20;
+ c_32 -= c_30 * c_20;
+ c_22 -= c_21 * c_21;
+ c_32 -= c_31 * c_21;
+ if(c_22>0)
+ {
+ c_22 = sqrt(c_22);
+ tmp = 1.0/c_22;
+ }
+ else
+ {
+ c_22 = 0.0;
+ tmp = 0.0;
+ }
+ c_32 *= tmp;
+ inv_diag_D[2] = tmp;
+
+ if(kn==3)
+ goto store;
+
+ c_33 -= c_30 * c_30;
+ c_33 -= c_31 * c_31;
+ c_33 -= c_32 * c_32;
+ if(c_33>0)
+ {
+ c_33 = sqrt(c_33);
+ tmp = 1.0/c_33;
+ }
+ else
+ {
+ c_33 = 0.0;
+ tmp = 0.0;
+ }
+ inv_diag_D[3] = tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+// D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+// D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+// D[1+bs*2] = c_12;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+// D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+// if(kn==1)
+// return;
+
+// D[0+bs*1] = c_01;
+
+// if(kn==2)
+// return;
+
+// D[0+bs*2] = c_02;
+
+// if(kn==3)
+// return;
+
+// D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_spotrf_nt_l_4x4_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D)
+ {
+ kernel_spotrf_nt_l_4x4_vs_lib4(kmax, A, B, C, D, inv_diag_D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
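+// Fused syrk + Cholesky: first the lower part of D = C + Ap * Bp^T is formed
+// (alpha = beta = 1), then the Cholesky kernel above factorizes D - Am * Bm^T in place.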
+void kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn)
+ {
+ float alpha = 1.0;
+ float beta = 1.0;
+ kernel_ssyrk_nt_l_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
+ kernel_spotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_ssyrk_spotrf_nt_l_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D)
+ {
+ float alpha = 1.0;
+ float beta = 1.0;
+ kernel_ssyrk_nt_l_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
+ kernel_spotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
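+// Right triangular solve: accumulates C - A * B^T, then solves against the transpose of
+// the lower-triangular block E column by column, scaling each column by the reciprocal
+// diagonal entry supplied in inv_diag_E. km and kn clip the rows and columns stored in D.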
+void kernel_strsm_nt_rl_inv_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+ c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+ c_02 = C[0+bs*2] + c_02;
+ c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+ c_03 = C[0+bs*3] + c_03;
+ c_13 = C[1+bs*3] + c_13;
+ c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+ tmp = inv_diag_E[0];
+ c_00 *= tmp;
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+
+ if(kn==1)
+ goto store;
+
+ tmp = E[1+bs*0];
+ c_01 -= c_00 * tmp;
+ c_11 -= c_10 * tmp;
+ c_21 -= c_20 * tmp;
+ c_31 -= c_30 * tmp;
+ tmp = inv_diag_E[1];
+ c_01 *= tmp;
+ c_11 *= tmp;
+ c_21 *= tmp;
+ c_31 *= tmp;
+
+ if(kn==2)
+ goto store;
+
+ tmp = E[2+bs*0];
+ c_02 -= c_00 * tmp;
+ c_12 -= c_10 * tmp;
+ c_22 -= c_20 * tmp;
+ c_32 -= c_30 * tmp;
+ tmp = E[2+bs*1];
+ c_02 -= c_01 * tmp;
+ c_12 -= c_11 * tmp;
+ c_22 -= c_21 * tmp;
+ c_32 -= c_31 * tmp;
+ tmp = inv_diag_E[2];
+ c_02 *= tmp;
+ c_12 *= tmp;
+ c_22 *= tmp;
+ c_32 *= tmp;
+
+ if(kn==3)
+ goto store;
+
+ tmp = E[3+bs*0];
+ c_03 -= c_00 * tmp;
+ c_13 -= c_10 * tmp;
+ c_23 -= c_20 * tmp;
+ c_33 -= c_30 * tmp;
+ tmp = E[3+bs*1];
+ c_03 -= c_01 * tmp;
+ c_13 -= c_11 * tmp;
+ c_23 -= c_21 * tmp;
+ c_33 -= c_31 * tmp;
+ tmp = E[3+bs*2];
+ c_03 -= c_02 * tmp;
+ c_13 -= c_12 * tmp;
+ c_23 -= c_22 * tmp;
+ c_33 -= c_32 * tmp;
+ tmp = inv_diag_E[3];
+ c_03 *= tmp;
+ c_13 *= tmp;
+ c_23 *= tmp;
+ c_33 *= tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nt_rl_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E)
+ {
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
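+// Fused gemm + trsm: D = C + Ap * Bp^T (alpha = beta = 1), then the strsm kernel above
+// is applied in place to D - Am * Bm^T.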
+void kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+ {
+ float alpha = 1.0;
+ float beta = 1.0;
+ kernel_sgemm_nt_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
+ kernel_strsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, D, D, E, inv_diag_E, km, kn);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_strsm_nt_rl_inv_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E)
+ {
+ float alpha = 1.0;
+ float beta = 1.0;
+ kernel_sgemm_nt_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
+ kernel_strsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, D, D, E, inv_diag_E);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
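+// Same right triangular solve as above, but E has unit diagonal: only the contributions
+// of the already-solved columns are subtracted, with no diagonal scaling.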
+void kernel_strsm_nt_rl_one_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+ c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+ c_02 = C[0+bs*2] + c_02;
+ c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+ c_03 = C[0+bs*3] + c_03;
+ c_13 = C[1+bs*3] + c_13;
+ c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+ if(kn==1)
+ goto store;
+
+ tmp = E[1+bs*0];
+ c_01 -= c_00 * tmp;
+ c_11 -= c_10 * tmp;
+ c_21 -= c_20 * tmp;
+ c_31 -= c_30 * tmp;
+
+ if(kn==2)
+ goto store;
+
+ tmp = E[2+bs*0];
+ c_02 -= c_00 * tmp;
+ c_12 -= c_10 * tmp;
+ c_22 -= c_20 * tmp;
+ c_32 -= c_30 * tmp;
+ tmp = E[2+bs*1];
+ c_02 -= c_01 * tmp;
+ c_12 -= c_11 * tmp;
+ c_22 -= c_21 * tmp;
+ c_32 -= c_31 * tmp;
+
+ if(kn==3)
+ goto store;
+
+ tmp = E[3+bs*0];
+ c_03 -= c_00 * tmp;
+ c_13 -= c_10 * tmp;
+ c_23 -= c_20 * tmp;
+ c_33 -= c_30 * tmp;
+ tmp = E[3+bs*1];
+ c_03 -= c_01 * tmp;
+ c_13 -= c_11 * tmp;
+ c_23 -= c_21 * tmp;
+ c_33 -= c_31 * tmp;
+ tmp = E[3+bs*2];
+ c_03 -= c_02 * tmp;
+ c_13 -= c_12 * tmp;
+ c_23 -= c_22 * tmp;
+ c_33 -= c_32 * tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nt_rl_one_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E)
+ {
+ kernel_strsm_nt_rl_one_4x4_vs_lib4(k, A, B, C, D, E, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
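+// Right triangular solve against the transpose of the upper-triangular block E:
+// accumulates C - A * B^T, then back-substitutes from the last active column to the
+// first, scaling by the reciprocal diagonal entries in inv_diag_E.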
+void kernel_strsm_nt_ru_inv_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ float
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ tmp,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ int k;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 1
+
+ a_0 = A[4];
+ a_1 = A[5];
+ a_2 = A[6];
+ a_3 = A[7];
+
+ b_0 = B[4];
+ b_1 = B[5];
+ b_2 = B[6];
+ b_3 = B[7];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 2
+
+ a_0 = A[8];
+ a_1 = A[9];
+ a_2 = A[10];
+ a_3 = A[11];
+
+ b_0 = B[8];
+ b_1 = B[9];
+ b_2 = B[10];
+ b_3 = B[11];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ // k = 3
+
+ a_0 = A[12];
+ a_1 = A[13];
+ a_2 = A[14];
+ a_3 = A[15];
+
+ b_0 = B[12];
+ b_1 = B[13];
+ b_2 = B[14];
+ b_3 = B[15];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 16;
+ B += 16;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ // k = 0
+
+ a_0 = A[0];
+ a_1 = A[1];
+ a_2 = A[2];
+ a_3 = A[3];
+
+ b_0 = B[0];
+ b_1 = B[1];
+ b_2 = B[2];
+ b_3 = B[3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+ A += 4;
+ B += 4;
+
+ }
+
+ c_00 = C[0+bs*0] + c_00;
+ c_10 = C[1+bs*0] + c_10;
+ c_20 = C[2+bs*0] + c_20;
+ c_30 = C[3+bs*0] + c_30;
+
+ c_01 = C[0+bs*1] + c_01;
+ c_11 = C[1+bs*1] + c_11;
+ c_21 = C[2+bs*1] + c_21;
+ c_31 = C[3+bs*1] + c_31;
+
+ c_02 = C[0+bs*2] + c_02;
+ c_12 = C[1+bs*2] + c_12;
+ c_22 = C[2+bs*2] + c_22;
+ c_32 = C[3+bs*2] + c_32;
+
+ c_03 = C[0+bs*3] + c_03;
+ c_13 = C[1+bs*3] + c_13;
+ c_23 = C[2+bs*3] + c_23;
+ c_33 = C[3+bs*3] + c_33;
+
+
+ if(kn>3)
+ {
+ tmp = inv_diag_E[3];
+ c_03 *= tmp;
+ c_13 *= tmp;
+ c_23 *= tmp;
+ c_33 *= tmp;
+ tmp = E[2+bs*3];
+ c_02 -= c_03 * tmp;
+ c_12 -= c_13 * tmp;
+ c_22 -= c_23 * tmp;
+ c_32 -= c_33 * tmp;
+ tmp = E[1+bs*3];
+ c_01 -= c_03 * tmp;
+ c_11 -= c_13 * tmp;
+ c_21 -= c_23 * tmp;
+ c_31 -= c_33 * tmp;
+ tmp = E[0+bs*3];
+ c_00 -= c_03 * tmp;
+ c_10 -= c_13 * tmp;
+ c_20 -= c_23 * tmp;
+ c_30 -= c_33 * tmp;
+ }
+
+ if(kn>2)
+ {
+ tmp = inv_diag_E[2];
+ c_02 *= tmp;
+ c_12 *= tmp;
+ c_22 *= tmp;
+ c_32 *= tmp;
+ tmp = E[1+bs*2];
+ c_01 -= c_02 * tmp;
+ c_11 -= c_12 * tmp;
+ c_21 -= c_22 * tmp;
+ c_31 -= c_32 * tmp;
+ tmp = E[0+bs*2];
+ c_00 -= c_02 * tmp;
+ c_10 -= c_12 * tmp;
+ c_20 -= c_22 * tmp;
+ c_30 -= c_32 * tmp;
+ }
+
+ if(kn>1)
+ {
+ tmp = inv_diag_E[1];
+ c_01 *= tmp;
+ c_11 *= tmp;
+ c_21 *= tmp;
+ c_31 *= tmp;
+ tmp = E[0+bs*1];
+ c_00 -= c_01 * tmp;
+ c_10 -= c_11 * tmp;
+ c_20 -= c_21 * tmp;
+ c_30 -= c_31 * tmp;
+ }
+
+ tmp = inv_diag_E[0];
+ c_00 *= tmp;
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nt_ru_inv_4x4_lib4(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E)
+ {
+ kernel_strsm_nt_ru_inv_4x4_vs_lib4(k, A, B, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
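+// Small LU factorization kernel (no pivoting): accumulates C - A * B (B read through
+// 4-row panels with panel stride sdb), then factorizes the 4x4 block in registers. The
+// unit-lower factor is stored below the diagonal of D, the upper factor on and above it,
+// and the reciprocals of the pivots are written to inv_diag_D.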
+void kernel_sgetrf_nn_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+ // factorization
+
+ // first column
+ tmp = 1.0 / c_00;
+ c_10 *= tmp;
+ c_20 *= tmp;
+ c_30 *= tmp;
+
+ inv_diag_D[0] = tmp;
+
+ if(kn==1)
+ goto store;
+
+ // second column
+ c_11 -= c_10 * c_01;
+ c_21 -= c_20 * c_01;
+ c_31 -= c_30 * c_01;
+
+ tmp = 1.0 / c_11;
+ c_21 *= tmp;
+ c_31 *= tmp;
+
+ inv_diag_D[1] = tmp;
+
+ if(kn==2)
+ goto store;
+
+ // third column
+ c_12 -= c_10 * c_02;
+ c_22 -= c_20 * c_02;
+ c_32 -= c_30 * c_02;
+
+ c_22 -= c_21 * c_12;
+ c_32 -= c_31 * c_12;
+
+ tmp = 1.0 / c_22;
+ c_32 *= tmp;
+
+ inv_diag_D[2] = tmp;
+
+ if(kn==3)
+ goto store;
+
+ // fourth column
+ c_13 -= c_10 * c_03;
+ c_23 -= c_20 * c_03;
+ c_33 -= c_30 * c_03;
+
+ c_23 -= c_21 * c_13;
+ c_33 -= c_31 * c_13;
+
+ c_33 -= c_32 * c_23;
+
+ tmp = 1.0 / c_33;
+
+ inv_diag_D[3] = tmp;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgetrf_nn_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D)
+ {
+ kernel_sgetrf_nn_4x4_vs_lib4(kmax, A, B, sdb, C, D, inv_diag_D, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
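+// Left triangular solve with the unit-diagonal lower-triangular block E: accumulates
+// C - A * B, then forward-substitutes row by row, each solved row updating the rows
+// below it through the sub-diagonal entries of E.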
+void kernel_strsm_nn_ll_one_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ e_1, e_2, e_3,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+ // solution
+
+ if(km==1)
+ goto store;
+
+ e_1 = E[1+bs*0];
+ e_2 = E[2+bs*0];
+ e_3 = E[3+bs*0];
+ c_10 -= e_1 * c_00;
+ c_20 -= e_2 * c_00;
+ c_30 -= e_3 * c_00;
+ c_11 -= e_1 * c_01;
+ c_21 -= e_2 * c_01;
+ c_31 -= e_3 * c_01;
+ c_12 -= e_1 * c_02;
+ c_22 -= e_2 * c_02;
+ c_32 -= e_3 * c_02;
+ c_13 -= e_1 * c_03;
+ c_23 -= e_2 * c_03;
+ c_33 -= e_3 * c_03;
+
+ if(km==2)
+ goto store;
+
+ e_2 = E[2+bs*1];
+ e_3 = E[3+bs*1];
+ c_20 -= e_2 * c_10;
+ c_30 -= e_3 * c_10;
+ c_21 -= e_2 * c_11;
+ c_31 -= e_3 * c_11;
+ c_22 -= e_2 * c_12;
+ c_32 -= e_3 * c_12;
+ c_23 -= e_2 * c_13;
+ c_33 -= e_3 * c_13;
+
+ if(km==3)
+ goto store;
+
+ e_3 = E[3+bs*2];
+ c_30 -= e_3 * c_20;
+ c_31 -= e_3 * c_21;
+ c_32 -= e_3 * c_22;
+ c_33 -= e_3 * c_23;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_ll_one_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E)
+ {
+ kernel_strsm_nn_ll_one_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
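+// Right triangular solve with the upper-triangular block E: accumulates C - A * B, then
+// solves the columns left to right, subtracting the contributions of the already-solved
+// columns and scaling by the reciprocal diagonal entries in inv_diag_E.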
+void kernel_strsm_nn_ru_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ e_00, e_01, e_02, e_03,
+ e_11, e_12, e_13,
+ e_22, e_23,
+ e_33,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+ // solve
+
+ e_00 = inv_diag_E[0];
+ c_00 *= e_00;
+ c_10 *= e_00;
+ c_20 *= e_00;
+ c_30 *= e_00;
+
+ if(kn==1)
+ goto store;
+
+ e_01 = E[0+bs*1];
+ e_11 = inv_diag_E[1];
+ c_01 -= c_00 * e_01;
+ c_11 -= c_10 * e_01;
+ c_21 -= c_20 * e_01;
+ c_31 -= c_30 * e_01;
+ c_01 *= e_11;
+ c_11 *= e_11;
+ c_21 *= e_11;
+ c_31 *= e_11;
+
+ if(kn==2)
+ goto store;
+
+ e_02 = E[0+bs*2];
+ e_12 = E[1+bs*2];
+ e_22 = inv_diag_E[2];
+ c_02 -= c_00 * e_02;
+ c_12 -= c_10 * e_02;
+ c_22 -= c_20 * e_02;
+ c_32 -= c_30 * e_02;
+ c_02 -= c_01 * e_12;
+ c_12 -= c_11 * e_12;
+ c_22 -= c_21 * e_12;
+ c_32 -= c_31 * e_12;
+ c_02 *= e_22;
+ c_12 *= e_22;
+ c_22 *= e_22;
+ c_32 *= e_22;
+
+ if(kn==3)
+ goto store;
+
+ e_03 = E[0+bs*3];
+ e_13 = E[1+bs*3];
+ e_23 = E[2+bs*3];
+ e_33 = inv_diag_E[3];
+ c_03 -= c_00 * e_03;
+ c_13 -= c_10 * e_03;
+ c_23 -= c_20 * e_03;
+ c_33 -= c_30 * e_03;
+ c_03 -= c_01 * e_13;
+ c_13 -= c_11 * e_13;
+ c_23 -= c_21 * e_13;
+ c_33 -= c_31 * e_13;
+ c_03 -= c_02 * e_23;
+ c_13 -= c_12 * e_23;
+ c_23 -= c_22 * e_23;
+ c_33 -= c_32 * e_23;
+ c_03 *= e_33;
+ c_13 *= e_33;
+ c_23 *= e_33;
+ c_33 *= e_33;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_ru_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E)
+ {
+ kernel_strsm_nn_ru_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
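+// Left triangular solve with the upper-triangular block E: accumulates C - A * B, then
+// back-substitutes row by row starting from the last active row, scaling each row by
+// the reciprocal diagonal entry from inv_diag_E.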
+void kernel_strsm_nn_lu_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ tmp,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ e_00, e_01, e_02, e_03,
+ e_11, e_12, e_13,
+ e_22, e_23,
+ e_33,
+ c_00=0, c_01=0, c_02=0, c_03=0,
+ c_10=0, c_11=0, c_12=0, c_13=0,
+ c_20=0, c_21=0, c_22=0, c_23=0,
+ c_30=0, c_31=0, c_32=0, c_33=0;
+
+ if(kmax<=0)
+ goto add;
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ b_0 = B[1+bs*0];
+ b_1 = B[1+bs*1];
+ b_2 = B[1+bs*2];
+ b_3 = B[1+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ b_0 = B[2+bs*0];
+ b_1 = B[2+bs*1];
+ b_2 = B[2+bs*2];
+ b_3 = B[2+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ b_0 = B[3+bs*0];
+ b_1 = B[3+bs*1];
+ b_2 = B[3+bs*2];
+ b_3 = B[3+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 16;
+ B += 4*sdb;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ b_0 = B[0+bs*0];
+ b_1 = B[0+bs*1];
+ b_2 = B[0+bs*2];
+ b_3 = B[0+bs*3];
+
+ c_00 -= a_0 * b_0;
+ c_10 -= a_1 * b_0;
+ c_20 -= a_2 * b_0;
+ c_30 -= a_3 * b_0;
+
+ c_01 -= a_0 * b_1;
+ c_11 -= a_1 * b_1;
+ c_21 -= a_2 * b_1;
+ c_31 -= a_3 * b_1;
+
+ c_02 -= a_0 * b_2;
+ c_12 -= a_1 * b_2;
+ c_22 -= a_2 * b_2;
+ c_32 -= a_3 * b_2;
+
+ c_03 -= a_0 * b_3;
+ c_13 -= a_1 * b_3;
+ c_23 -= a_2 * b_3;
+ c_33 -= a_3 * b_3;
+
+
+ A += 4;
+ B += 1;
+
+ }
+
+ add:
+
+ c_00 += C[0+bs*0];
+ c_10 += C[1+bs*0];
+ c_20 += C[2+bs*0];
+ c_30 += C[3+bs*0];
+
+ c_01 += C[0+bs*1];
+ c_11 += C[1+bs*1];
+ c_21 += C[2+bs*1];
+ c_31 += C[3+bs*1];
+
+ c_02 += C[0+bs*2];
+ c_12 += C[1+bs*2];
+ c_22 += C[2+bs*2];
+ c_32 += C[3+bs*2];
+
+ c_03 += C[0+bs*3];
+ c_13 += C[1+bs*3];
+ c_23 += C[2+bs*3];
+ c_33 += C[3+bs*3];
+
+
+ // solve
+
+ if(km>3)
+ {
+ e_03 = E[0+bs*3];
+ e_13 = E[1+bs*3];
+ e_23 = E[2+bs*3];
+ e_33 = inv_diag_E[3];
+ c_30 *= e_33;
+ c_31 *= e_33;
+ c_32 *= e_33;
+ c_33 *= e_33;
+ c_00 -= e_03 * c_30;
+ c_01 -= e_03 * c_31;
+ c_02 -= e_03 * c_32;
+ c_03 -= e_03 * c_33;
+ c_10 -= e_13 * c_30;
+ c_11 -= e_13 * c_31;
+ c_12 -= e_13 * c_32;
+ c_13 -= e_13 * c_33;
+ c_20 -= e_23 * c_30;
+ c_21 -= e_23 * c_31;
+ c_22 -= e_23 * c_32;
+ c_23 -= e_23 * c_33;
+ }
+
+ if(km>2)
+ {
+ e_02 = E[0+bs*2];
+ e_12 = E[1+bs*2];
+ e_22 = inv_diag_E[2];
+ c_20 *= e_22;
+ c_21 *= e_22;
+ c_22 *= e_22;
+ c_23 *= e_22;
+ c_00 -= e_02 * c_20;
+ c_01 -= e_02 * c_21;
+ c_02 -= e_02 * c_22;
+ c_03 -= e_02 * c_23;
+ c_10 -= e_12 * c_20;
+ c_11 -= e_12 * c_21;
+ c_12 -= e_12 * c_22;
+ c_13 -= e_12 * c_23;
+ }
+
+ if(km>1)
+ {
+ e_01 = E[0+bs*1];
+ e_11 = inv_diag_E[1];
+ c_10 *= e_11;
+ c_11 *= e_11;
+ c_12 *= e_11;
+ c_13 *= e_11;
+ c_00 -= e_01 * c_10;
+ c_01 -= e_01 * c_11;
+ c_02 -= e_01 * c_12;
+ c_03 -= e_01 * c_13;
+ }
+
+ e_00 = inv_diag_E[0];
+ c_00 *= e_00;
+ c_01 *= e_00;
+ c_02 *= e_00;
+ c_03 *= e_00;
+
+ store:
+
+ if(km>=4)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+ D[3+bs*0] = c_30;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+ D[3+bs*1] = c_31;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+ D[3+bs*2] = c_32;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ D[3+bs*3] = c_33;
+ }
+ else if(km>=3)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+ D[2+bs*0] = c_20;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+ D[2+bs*1] = c_21;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+ D[2+bs*2] = c_22;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ D[2+bs*3] = c_23;
+ }
+ else if(km>=2)
+ {
+ D[0+bs*0] = c_00;
+ D[1+bs*0] = c_10;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+ D[1+bs*1] = c_11;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+ D[1+bs*2] = c_12;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ D[1+bs*3] = c_13;
+ }
+ else //if(km>=1)
+ {
+ D[0+bs*0] = c_00;
+
+ if(kn==1)
+ return;
+
+ D[0+bs*1] = c_01;
+
+ if(kn==2)
+ return;
+
+ D[0+bs*2] = c_02;
+
+ if(kn==3)
+ return;
+
+ D[0+bs*3] = c_03;
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsm_nn_lu_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E)
+ {
+ kernel_strsm_nn_lu_inv_4x4_vs_lib4(kmax, A, B, sdb, C, D, E, inv_diag_E, 4, 4);
+ }
+#endif
+
diff --git a/kernel/c99/kernel_sgemm_diag_lib4.c b/kernel/c99/kernel_sgemm_diag_lib4.c
new file mode 100644
index 0000000..93df707
--- /dev/null
+++ b/kernel/c99/kernel_sgemm_diag_lib4.c
@@ -0,0 +1,1112 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// B is the diagonal of a matrix, case beta=0.0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+ b_2 = alpha0 * B[2];
+ b_3 = alpha0 * B[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_0;
+ c_2 = a_2 * b_0;
+ c_3 = a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = a_0 * b_1;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_1;
+ c_3 = a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ c_0 = a_0 * b_2;
+ c_1 = a_1 * b_2;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ c_0 = a_0 * b_3;
+ c_1 = a_1 * b_3;
+ c_2 = a_2 * b_3;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ A += 4*sda;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ a_0 = A[0+bs*2];
+
+ c_0 = a_0 * b_2;
+
+ D[0+bs*2] = c_0;
+
+
+ a_0 = A[0+bs*3];
+
+ c_0 = a_0 * b_3;
+
+ D[0+bs*3] = c_0;
+
+
+ A += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
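+
+// Illustrative note (not BLASFEO code): in the "lib4" panel-major layout assumed above, a matrix is
+// stored in panels of bs=4 rows, element (i,j) of a panel sits at p[i%4 + 4*j], and consecutive panels
+// are 4*sda (resp. 4*sdd) floats apart. The kernel above thus computes, for a kmax x 4 block,
+// D(i,j) = alpha * A(i,j) * B(j), i.e. D = alpha * A * diag(B[0:4]) with beta = 0.
+// The guarded function below is a hedged plain-C reference of the same operation, meant only as
+// documentation; its name and the BLASFEO_REF_SKETCH macro are hypothetical and unused by the library.
+#if defined(BLASFEO_REF_SKETCH)
+static void ref_sgemm_diag_right_4_a0(int kmax, float alpha, float *A, int sda, float *B, float *D, int sdd)
+	{
+	int i, j;
+	for(i=0; i<kmax; i++)
+		{
+		// address of row i inside its 4-row panel
+		float *a = A + (i/4)*4*sda + i%4;
+		float *d = D + (i/4)*4*sdd + i%4;
+		for(j=0; j<4; j++)
+			d[4*j] = alpha * a[4*j] * B[j];
+		}
+	}
+#endif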
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+ b_2 = alpha0 * B[2];
+ b_3 = alpha0 * B[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ a_0 = A[0+bs*3];
+ a_1 = A[1+bs*3];
+ a_2 = A[2+bs*3];
+ a_3 = A[3+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_3;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_3;
+ c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ a_0 = A[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+ D[0+bs*2] = c_0;
+
+
+ a_0 = A[0+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_3;
+
+ D[0+bs*3] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+ b_2 = alpha0 * B[2];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ a_0 = A[0+bs*2];
+ a_1 = A[1+bs*2];
+ a_2 = A[2+bs*2];
+ a_3 = A[3+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_2;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*2] + a_3 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ a_0 = A[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_2;
+
+ D[0+bs*2] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+ b_1 = alpha0 * B[1];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ a_0 = A[0+bs*1];
+ a_1 = A[1+bs*1];
+ a_2 = A[2+bs*1];
+ a_3 = A[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_1;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ a_0 = A[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_1;
+
+ D[0+bs*1] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// B is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ b_0 = alpha0 * B[0];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ a_0 = A[0+bs*0];
+ a_1 = A[1+bs*0];
+ a_2 = A[2+bs*0];
+ a_3 = A[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_0;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_0;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_0;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ A += 4*sda;
+ C += 4*sdc;
+ D += 4*sdd;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ a_0 = A[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ A += 1;
+ C += 1;
+ D += 1;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix, case beta=0.0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_4_a0_lib4(int kmax, float *alpha, float *A, float *B, float *D, int alg)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+ a_3 = alpha0 * A[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+ b_3 = B[3+bs*1];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+ b_3 = B[3+bs*2];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+ b_3 = B[3+bs*3];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ B += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = a_0 * b_0;
+ c_1 = a_1 * b_1;
+ c_2 = a_2 * b_2;
+ c_3 = a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+ B += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
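+
+// Illustrative note (not BLASFEO code): the kernel above works on a 4 x kmax block that lives in a
+// single 4-row panel, which is why B and D advance by 16 floats (4 columns of 4) per unrolled step.
+// It computes D(i,j) = alpha * A(i) * B(i,j), i.e. D = alpha * diag(A[0:4]) * B with beta = 0; the
+// trailing `int alg` argument appears unused in this generic implementation.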
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int alg)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2, a_3,
+ b_0, b_1, b_2, b_3,
+ c_0, c_1, c_2, c_3;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+ a_3 = alpha0 * A[3];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+ b_3 = B[3+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*1] + a_3 * b_3;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+ D[3+bs*1] = c_3;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+ b_3 = B[3+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*2] + a_3 * b_3;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+ D[3+bs*2] = c_3;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+ b_3 = B[3+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*3] + a_3 * b_3;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+ D[3+bs*3] = c_3;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+ b_3 = B[3+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+ c_3 = beta0 * C[3+bs*0] + a_3 * b_3;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+ D[3+bs*0] = c_3;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_3_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1, a_2,
+ b_0, b_1, b_2,
+ c_0, c_1, c_2;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+ a_2 = alpha0 * A[2];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+ b_2 = B[2+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*1] + a_2 * b_2;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+ D[2+bs*1] = c_2;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+ b_2 = B[2+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*2] + a_2 * b_2;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+ D[2+bs*2] = c_2;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+ b_2 = B[2+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*3] + a_2 * b_2;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+ D[2+bs*3] = c_2;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+ b_2 = B[2+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+ c_2 = beta0 * C[2+bs*0] + a_2 * b_2;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+ D[2+bs*0] = c_2;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_2_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0, a_1,
+ b_0, b_1,
+ c_0, c_1;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+ a_1 = alpha0 * A[1];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+
+
+ b_0 = B[0+bs*1];
+ b_1 = B[1+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*1] + a_1 * b_1;
+
+ D[0+bs*1] = c_0;
+ D[1+bs*1] = c_1;
+
+
+ b_0 = B[0+bs*2];
+ b_1 = B[1+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*2] + a_1 * b_1;
+
+ D[0+bs*2] = c_0;
+ D[1+bs*2] = c_1;
+
+
+ b_0 = B[0+bs*3];
+ b_1 = B[1+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*3] + a_1 * b_1;
+
+ D[0+bs*3] = c_0;
+ D[1+bs*3] = c_1;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+ b_1 = B[1+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+ c_1 = beta0 * C[1+bs*0] + a_1 * b_1;
+
+ D[0+bs*0] = c_0;
+ D[1+bs*0] = c_1;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
+// A is the diagonal of a matrix
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemm_diag_left_1_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ alpha0, beta0,
+ a_0,
+ b_0,
+ c_0;
+
+ alpha0 = alpha[0];
+ beta0 = beta[0];
+
+ a_0 = alpha0 * A[0];
+
+ for(k=0; k<kmax-3; k+=4)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+
+ b_0 = B[0+bs*1];
+
+ c_0 = beta0 * C[0+bs*1] + a_0 * b_0;
+
+ D[0+bs*1] = c_0;
+
+
+ b_0 = B[0+bs*2];
+
+ c_0 = beta0 * C[0+bs*2] + a_0 * b_0;
+
+ D[0+bs*2] = c_0;
+
+
+ b_0 = B[0+bs*3];
+
+ c_0 = beta0 * C[0+bs*3] + a_0 * b_0;
+
+ D[0+bs*3] = c_0;
+
+ B += 16;
+ C += 16;
+ D += 16;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ b_0 = B[0+bs*0];
+
+ c_0 = beta0 * C[0+bs*0] + a_0 * b_0;
+
+ D[0+bs*0] = c_0;
+
+ B += 4;
+ C += 4;
+ D += 4;
+
+ }
+
+ }
+#endif
+
+
+
diff --git a/kernel/c99/kernel_sgemv_4_lib4.c b/kernel/c99/kernel_sgemv_4_lib4.c
new file mode 100644
index 0000000..03975f4
--- /dev/null
+++ b/kernel/c99/kernel_sgemv_4_lib4.c
@@ -0,0 +1,1010 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_gen_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ x_0,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ x_0 = x[1];
+
+ y_0 += A[0+bs*1] * x_0;
+ y_1 += A[1+bs*1] * x_0;
+ y_2 += A[2+bs*1] * x_0;
+ y_3 += A[3+bs*1] * x_0;
+
+ x_0 = x[2];
+
+ y_0 += A[0+bs*2] * x_0;
+ y_1 += A[1+bs*2] * x_0;
+ y_2 += A[2+bs*2] * x_0;
+ y_3 += A[3+bs*2] * x_0;
+
+ x_0 = x[3];
+
+ y_0 += A[0+bs*3] * x_0;
+ y_1 += A[1+bs*3] * x_0;
+ y_2 += A[2+bs*3] * x_0;
+ y_3 += A[3+bs*3] * x_0;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ A += 1*bs;
+ x += 1;
+
+ }
+
+ y_0 = alpha[0]*y_0 + beta[0]*y[0];
+ y_1 = alpha[0]*y_1 + beta[0]*y[1];
+ y_2 = alpha[0]*y_2 + beta[0]*y[2];
+ y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+ if(k0<=0 & k1>3)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ if(k0<=0 & k1>0) z[0] = y_0;
+ if(k0<=1 & k1>1) z[1] = y_1;
+ if(k0<=2 & k1>2) z[2] = y_2;
+ if(k0<=3 & k1>3) z[3] = y_3;
+ }
+
+ }
+#endif
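+
+// Hedged usage sketch (not BLASFEO API): the kernel above produces, for one 4-row panel of A,
+// z[i] = alpha*(A*x)[i] + beta*y[i] for the entries with k0 <= i < k1 (the others are left untouched),
+// which is what the edges of a blocked gemv need. A caller computing a full z = alpha*A*x + beta*y
+// could loop over row panels as below; the helper name, the BLASFEO_REF_SKETCH macro and the
+// assumption that m is a multiple of 4 are illustrative only.
+#if defined(BLASFEO_REF_SKETCH)
+static void ref_sgemv_n_lib4(int m, int kmax, float alpha, float *A, int sda, float *x, float beta, float *y, float *z)
+	{
+	int ii;
+	for(ii=0; ii<m; ii+=4) // one 4-row panel per iteration, m assumed to be a multiple of 4
+		kernel_sgemv_n_4_gen_lib4(kmax, &alpha, A+ii*sda, x, &beta, y+ii, z+ii, 0, 4);
+	}
+#endif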
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z)
+ {
+
+ kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, 4);
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_n_4_vs_lib4(int kmax, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1)
+ {
+
+ kernel_sgemv_n_4_gen_lib4(kmax, alpha, A, x, beta, y, z, 0, k1);
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int km)
+ {
+
+ const int bs = 4;
+
+ int k, kend;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ if(offA!=0) // 1, 2, 3
+ {
+ kend = 4-offA<kmax ? 4-offA : kmax;
+ for(; k<kend; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ A += 1;
+ x += 1;
+
+ }
+ A += bs*(sda-1);
+ }
+ for(; k<kmax-bs+1; k+=bs)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ y_0 += A[1+bs*0] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+ y_0 += A[2+bs*0] * x_2;
+ y_1 += A[2+bs*1] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+ y_0 += A[3+bs*0] * x_3;
+ y_1 += A[3+bs*1] * x_3;
+ y_2 += A[3+bs*2] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ A += 1;
+ x += 1;
+
+ }
+
+ y_0 = alpha[0]*y_0 + beta[0]*y[0];
+ y_1 = alpha[0]*y_1 + beta[0]*y[1];
+ y_2 = alpha[0]*y_2 + beta[0]*y[2];
+ y_3 = alpha[0]*y_3 + beta[0]*y[3];
+
+ if(km>=4)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ z[0] = y_0;
+ if(km>=2)
+ {
+ z[1] = y_1;
+ if(km>2)
+ {
+ z[2] = y_2;
+ }
+ }
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z)
+ {
+
+ kernel_sgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, 4);
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgemv_t_4_vs_lib4(int kmax, float *alpha, float *A, int sda, float *x, float *beta, float *y, float *z, int k1)
+ {
+
+ kernel_sgemv_t_4_gen_lib4(kmax, alpha, 0, A, sda, x, beta, y, z, k1);
+
+ }
+#endif
+
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_ln_inv_4_vs_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[1+bs*0] * x_0;
+ y_2 -= A[2+bs*0] * x_0;
+ y_3 -= A[3+bs*0] * x_0;
+
+ y_0 -= A[0+bs*1] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[2+bs*1] * x_1;
+ y_3 -= A[3+bs*1] * x_1;
+
+ y_0 -= A[0+bs*2] * x_2;
+ y_1 -= A[1+bs*2] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+ y_3 -= A[3+bs*2] * x_2;
+
+ y_0 -= A[0+bs*3] * x_3;
+ y_1 -= A[1+bs*3] * x_3;
+ y_2 -= A[2+bs*3] * x_3;
+ y_3 -= A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+ y_3 = y[3] + y_3;
+
+ float
+ a_00, a_10, a_20, a_30,
+ a_11, a_21, a_31;
+
+ // a_00
+ a_00 = inv_diag_A[0];
+ a_10 = A[1+bs*0];
+ a_20 = A[2+bs*0];
+ a_30 = A[3+bs*0];
+ y_0 *= a_00;
+ z[0] = y_0;
+ y_1 -= a_10 * y_0;
+ y_2 -= a_20 * y_0;
+ y_3 -= a_30 * y_0;
+
+ if(kn==1)
+ {
+ if(km==1)
+ return;
+ y[1] = y_1;
+ if(km==2)
+ return;
+ y[2] = y_2;
+ if(km==3)
+ return;
+ y[3] = y_3;
+ return;
+ }
+
+ // a_11
+ a_11 = inv_diag_A[1];
+ a_21 = A[2+bs*1];
+ a_31 = A[3+bs*1];
+ y_1 *= a_11;
+ z[1] = y_1;
+ y_2 -= a_21 * y_1;
+ y_3 -= a_31 * y_1;
+
+ if(kn==2)
+ {
+ if(km==2)
+ return;
+ y[2] = y_2;
+ if(km==3)
+ return;
+ y[3] = y_3;
+ return;
+ }
+
+ // a_22
+ a_00 = inv_diag_A[2];
+ a_10 = A[3+bs*2];
+ y_2 *= a_00;
+ z[2] = y_2;
+ y_3 -= a_10 * y_2;
+
+ if(kn==3)
+ {
+ if(km==3)
+ return;
+ y[3] = y_3;
+
+ return;
+ }
+
+ // a_33
+ a_11 = inv_diag_A[3];
+ y_3 *= a_11;
+ z[3] = y_3;
+
+ }
+#endif
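+
+// Illustrative note (not BLASFEO code): the kernel above is one step of a blocked forward substitution.
+// It first forms y[0:4] - A[0:4,0:kmax]*x[0:kmax], the correction from the already-solved unknowns (the
+// loop structure suggests kmax is expected to be a multiple of 4), then solves the lower-triangular 4x4
+// diagonal block in place, multiplying by the pre-computed reciprocals in inv_diag_A instead of dividing;
+// km and kn mask how many rows and columns are valid at the edge of the matrix.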
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_ln_inv_4_lib4(int kmax, float *A, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ kernel_strsv_ln_inv_4_vs_lib4(kmax, A, inv_diag_A, x, y, z, 4, 4);
+
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_4_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ float *tA, *tx;
+ tA = A;
+ tx = x;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+ y_3 -= A[0+bs*3] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[1+bs*2] * x_1;
+ y_3 -= A[1+bs*3] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+ y_3 -= A[2+bs*3] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+ y_3 -= A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+ y_3 -= A[0+bs*3] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+ y_3 = y[3] + y_3;
+
+ A = tA;
+ x = tx;
+
+	// bottom triangle
+ y_3 *= inv_diag_A[3];
+ z[3] = y_3;
+
+ y_2 -= A[3+bs*2] * y_3;
+ y_2 *= inv_diag_A[2];
+ z[2] = y_2;
+
+ // square
+ y_0 -= A[2+bs*0]*y_2 + A[3+bs*0]*y_3;
+ y_1 -= A[2+bs*1]*y_2 + A[3+bs*1]*y_3;
+
+	// top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
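+
+// Illustrative note (not BLASFEO code): the kernel above is the transposed counterpart (backward
+// substitution). It first forms y[0:4] - A[4:kmax,0:4]^T * x[4:kmax], accumulating over the rows below
+// the diagonal block, then solves the transposed 4x4 lower-triangular block bottom-up (y_3 first),
+// again multiplying by the reciprocals stored in inv_diag_A rather than dividing by the diagonal entries.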
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_3_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ float *tA, *tx;
+ tA = A;
+ tx = x;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0;
+
+ k = 3;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_3 = x[3];
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+ y_2 -= A[1+bs*2] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+ y_2 -= A[2+bs*2] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+ y_2 -= A[3+bs*2] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 3;
+		x += 3;
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+ y_2 -= A[0+bs*2] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+ y_2 = y[2] + y_2;
+
+ A = tA;
+ x = tx;
+
+	// bottom triangle
+ y_2 *= inv_diag_A[2];
+ z[2] = y_2;
+
+ // square
+ y_0 -= A[2+bs*0]*y_2;
+ y_1 -= A[2+bs*1]*y_2;
+
+	// top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_2_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ float *tA, *tx;
+ tA = A;
+ tx = x;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0;
+
+ k = 2;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_1 -= A[1+bs*1] * x_1;
+
+ y_0 -= A[2+bs*0] * x_2;
+ y_1 -= A[2+bs*1] * x_2;
+
+ y_0 -= A[3+bs*0] * x_3;
+ y_1 -= A[3+bs*1] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 2;
+ x += 2;
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_1 -= A[0+bs*1] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+ y_1 = y[1] + y_1;
+
+ A = tA;
+ x = tx;
+
+	// top triangle
+ y_1 *= inv_diag_A[1];
+ z[1] = y_1;
+
+ y_0 -= A[1+bs*0] * y_1;
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strsv_lt_inv_1_lib4(int kmax, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ float *tA, *tx;
+ tA = A;
+ tx = x;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0;
+
+ k = 1;
+ if(kmax>4)
+ {
+ // clean up at the beginning
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[1+bs*0] * x_1;
+ y_0 -= A[2+bs*0] * x_2;
+ y_0 -= A[3+bs*0] * x_3;
+
+ k=4;
+ A += 4 + (sda-1)*bs;
+ x += 4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 -= A[0+bs*0] * x_0;
+ y_0 -= A[1+bs*0] * x_1;
+ y_0 -= A[2+bs*0] * x_2;
+ y_0 -= A[3+bs*0] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+ }
+ else
+ {
+ A += 1;
+ x += 1;
+ }
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 -= A[0+bs*0] * x_0;
+
+ A += 1;//sda*bs;
+ x += 1;
+
+ }
+
+ y_0 = y[0] + y_0;
+
+ A = tA;
+ x = tx;
+
+	// top triangle
+ y_0 *= inv_diag_A[0];
+ z[0] = y_0;
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_un_4_lib4(int kmax, float *A, float *x, float *z)
+ {
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+/* y_1 += A[1+bs*0] * x_0;*/
+/* y_2 += A[2+bs*0] * x_0;*/
+/* y_3 += A[3+bs*0] * x_0;*/
+
+ y_0 += A[0+bs*1] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+/* y_2 += A[2+bs*1] * x_1;*/
+/* y_3 += A[3+bs*1] * x_1;*/
+
+ y_0 += A[0+bs*2] * x_2;
+ y_1 += A[1+bs*2] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+/* y_3 += A[3+bs*2] * x_2;*/
+
+ y_0 += A[0+bs*3] * x_3;
+ y_1 += A[1+bs*3] * x_3;
+ y_2 += A[2+bs*3] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ k=4;
+ for(; k<kmax-3; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ y_0 += A[0+bs*1] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[2+bs*1] * x_1;
+ y_3 += A[3+bs*1] * x_1;
+
+ y_0 += A[0+bs*2] * x_2;
+ y_1 += A[1+bs*2] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[3+bs*2] * x_2;
+
+ y_0 += A[0+bs*3] * x_3;
+ y_1 += A[1+bs*3] * x_3;
+ y_2 += A[2+bs*3] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += 4*bs;
+ x += 4;
+
+ }
+
+ for(; k<kmax; k++)
+ {
+
+ x_0 = x[0];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[1+bs*0] * x_0;
+ y_2 += A[2+bs*0] * x_0;
+ y_3 += A[3+bs*0] * x_0;
+
+ A += 1*bs;
+ x += 1;
+
+ }
+
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+
+ }
+#endif
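+
+// Illustrative note (not BLASFEO code): in the upper-triangular matrix-vector product above, the first
+// 4x4 block is the triangle itself, so the products that would touch its (zero) strictly-lower part are
+// left commented out; the remaining kmax-4 columns are dense and are handled as a plain gemv.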
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_ut_4_vs_lib4(int kmax, float *A, int sda, float *x, float *z, int km)
+ {
+
+ const int bs = 4;
+
+ int
+ k;
+
+ float
+ x_0, x_1, x_2, x_3,
+ y_0=0, y_1=0, y_2=0, y_3=0;
+
+ k=0;
+ for(; k<kmax-4; k+=4)
+ {
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+ y_0 += A[1+bs*0] * x_1;
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+ y_0 += A[2+bs*0] * x_2;
+ y_1 += A[2+bs*1] * x_2;
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+ y_0 += A[3+bs*0] * x_3;
+ y_1 += A[3+bs*1] * x_3;
+ y_2 += A[3+bs*2] * x_3;
+ y_3 += A[3+bs*3] * x_3;
+
+ A += sda*bs;
+ x += 4;
+
+ }
+
+ x_0 = x[0];
+ x_1 = x[1];
+ x_2 = x[2];
+ x_3 = x[3];
+
+ y_0 += A[0+bs*0] * x_0;
+ y_1 += A[0+bs*1] * x_0;
+ y_2 += A[0+bs*2] * x_0;
+ y_3 += A[0+bs*3] * x_0;
+
+/* y_0 += A[1+bs*0] * x_1;*/
+ y_1 += A[1+bs*1] * x_1;
+ y_2 += A[1+bs*2] * x_1;
+ y_3 += A[1+bs*3] * x_1;
+
+/* y_0 += A[2+bs*0] * x_2;*/
+/* y_1 += A[2+bs*1] * x_2;*/
+ y_2 += A[2+bs*2] * x_2;
+ y_3 += A[2+bs*3] * x_2;
+
+/* y_0 += A[3+bs*0] * x_3;*/
+/* y_1 += A[3+bs*1] * x_3;*/
+/* y_2 += A[3+bs*2] * x_3;*/
+ y_3 += A[3+bs*3] * x_3;
+
+// A += sda*bs;
+// x += 4;
+
+ // store_vs
+ store:
+ if(km>=4)
+ {
+ z[0] = y_0;
+ z[1] = y_1;
+ z[2] = y_2;
+ z[3] = y_3;
+ }
+ else
+ {
+ z[0] = y_0;
+ if(km>=2)
+ {
+ z[1] = y_1;
+ if(km>2)
+ {
+ z[2] = y_2;
+ }
+ }
+ }
+
+ }
+#endif
+
+
+
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_strmv_ut_4_lib4(int kmax, float *A, int sda, float *x, float *z)
+ {
+
+ kernel_strmv_ut_4_vs_lib4(kmax, A, sda, x, z, 4);
+
+ }
+#endif
+
+
+
+
+
+
diff --git a/kernel/c99/kernel_sgetrf_pivot_4_lib4.c b/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
new file mode 100644
index 0000000..fdec8de
--- /dev/null
+++ b/kernel/c99/kernel_sgetrf_pivot_4_lib4.c
@@ -0,0 +1,786 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+
+#include "../../include/blasfeo_common.h"
+#include "../../include/blasfeo_s_aux.h"
+
+
+
+// C numbering, starting from 0
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void sidamax_lib4(int n, int offset, float *pA, int sda, int *p_idamax, float *p_amax)
+ {
+
+ int idamax, ii;
+ float tmp, amax;
+
+ p_idamax[0] = -1;
+ if(n<1)
+ return;
+
+ const int bs = 4;
+
+ int na = (bs - offset%bs)%bs;
+ na = n<na ? n : na;
+
+ amax = -1.0;
+ ii = 0;
+ if(na>0)
+ {
+ for( ; ii<na; ii++)
+ {
+ tmp = fabs(pA[0]);
+ if(tmp>amax)
+ {
+ idamax = ii+0;
+ amax = tmp;
+ }
+ pA += 1;
+ }
+ pA += bs*(sda-1);
+ }
+ for( ; ii<n-3; ii+=4)
+ {
+ tmp = fabs(pA[0]);
+ if(tmp>amax)
+ {
+ idamax = ii+0;
+ amax = tmp;
+ }
+ tmp = fabs(pA[1]);
+ if(tmp>amax)
+ {
+ idamax = ii+1;
+ amax = tmp;
+ }
+ tmp = fabs(pA[2]);
+ if(tmp>amax)
+ {
+ idamax = ii+2;
+ amax = tmp;
+ }
+ tmp = fabs(pA[3]);
+ if(tmp>amax)
+ {
+ idamax = ii+3;
+ amax = tmp;
+ }
+ pA += bs*sda;
+ }
+ for( ; ii<n; ii++)
+ {
+ tmp = fabs(pA[0]);
+ if(tmp>amax)
+ {
+ idamax = ii+0;
+ amax = tmp;
+ }
+ pA += 1;
+ }
+
+ p_amax[0] = amax;
+ p_idamax[0] = idamax;
+
+ return;
+
+ }
+#endif
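+
+// Illustrative note (not BLASFEO code): sidamax_lib4 is the panel-major analogue of BLAS isamax. It scans
+// n entries of a column starting at row `offset` inside its 4-row panel, jumping by 4*(sda-1) extra floats
+// at each panel boundary to reach the same column of the next panel, and returns both the largest absolute
+// value and its zero-based index relative to the starting row.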
+
+
+
+// C numbering (starting from zero) in the ipiv
+// it processes m>=4 rows and 4 cols
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgetrf_pivot_4_lib4(int m, float *pA, int sda, float *inv_diag_A, int* ipiv)
+ {
+
+ const int bs = 4;
+
+ // assume m>=4
+ int ma = m-4;
+
+ float
+ tmp0, tmp1, tmp2, tmp3,
+ u_00, u_01, u_02, u_03,
+ u_11, u_12, u_13,
+ u_22, u_23,
+ u_33;
+
+ float
+ *pB;
+
+ int
+ k, idamax;
+
+ // first column
+ sidamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ srowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ tmp0 = 1.0 / pA[0+bs*0];
+ inv_diag_A[0] = tmp0;
+ pA[1+bs*0] *= tmp0;
+ pA[2+bs*0] *= tmp0;
+ pA[3+bs*0] *= tmp0;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB[1+bs*0] *= tmp0;
+ pB[2+bs*0] *= tmp0;
+ pB[3+bs*0] *= tmp0;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[0] = 0.0;
+ }
+
+ // second column
+ u_01 = pA[0+bs*1];
+ tmp1 = pA[1+bs*1];
+ tmp2 = pA[2+bs*1];
+ tmp3 = pA[3+bs*1];
+ tmp1 -= pA[1+bs*0] * u_01;
+ tmp2 -= pA[2+bs*0] * u_01;
+ tmp3 -= pA[3+bs*0] * u_01;
+ pA[1+bs*1] = tmp1;
+ pA[2+bs*1] = tmp2;
+ pA[3+bs*1] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp1 = pB[1+bs*1];
+ tmp2 = pB[2+bs*1];
+ tmp3 = pB[3+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ tmp1 -= pB[1+bs*0] * u_01;
+ tmp2 -= pB[2+bs*0] * u_01;
+ tmp3 -= pB[3+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB[1+bs*1] = tmp1;
+ pB[2+bs*1] = tmp2;
+ pB[3+bs*1] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB += 1;
+ }
+
+ sidamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+ ipiv[1] = idamax+1;
+ if(tmp1!=0)
+ {
+ if(ipiv[1]!=1)
+ srowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ tmp1 = 1.0 / pA[1+bs*1];
+ inv_diag_A[1] = tmp1;
+ pA[2+bs*1] *= tmp1;
+ pA[3+bs*1] *= tmp1;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB[1+bs*1] *= tmp1;
+ pB[2+bs*1] *= tmp1;
+ pB[3+bs*1] *= tmp1;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[1] = 0.0;
+ }
+
+ // third column
+ u_02 = pA[0+bs*2];
+ u_12 = pA[1+bs*2];
+ u_12 -= pA[1+bs*0] * u_02;
+ pA[1+bs*2] = u_12;
+ tmp2 = pA[2+bs*2];
+ tmp3 = pA[3+bs*2];
+ tmp2 -= pA[2+bs*0] * u_02;
+ tmp3 -= pA[3+bs*0] * u_02;
+ tmp2 -= pA[2+bs*1] * u_12;
+ tmp3 -= pA[3+bs*1] * u_12;
+ pA[2+bs*2] = tmp2;
+ pA[3+bs*2] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp1 = pB[1+bs*2];
+ tmp2 = pB[2+bs*2];
+ tmp3 = pB[3+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp1 -= pB[1+bs*0] * u_02;
+ tmp2 -= pB[2+bs*0] * u_02;
+ tmp3 -= pB[3+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ tmp1 -= pB[1+bs*1] * u_12;
+ tmp2 -= pB[2+bs*1] * u_12;
+ tmp3 -= pB[3+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB[1+bs*2] = tmp1;
+ pB[2+bs*2] = tmp2;
+ pB[3+bs*2] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB += 1;
+ }
+
+ sidamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+ ipiv[2] = idamax+2;
+ if(tmp2!=0)
+ {
+ if(ipiv[2]!=2)
+ srowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ tmp2 = 1.0 / pA[2+bs*2];
+ inv_diag_A[2] = tmp2;
+ pA[3+bs*2] *= tmp2;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB[1+bs*2] *= tmp2;
+ pB[2+bs*2] *= tmp2;
+ pB[3+bs*2] *= tmp2;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[2] = 0.0;
+ }
+
+ // fourth column
+ u_03 = pA[0+bs*3];
+ u_13 = pA[1+bs*3];
+ u_13 -= pA[1+bs*0] * u_03;
+ pA[1+bs*3] = u_13;
+ u_23 = pA[2+bs*3];
+ u_23 -= pA[2+bs*0] * u_03;
+ u_23 -= pA[2+bs*1] * u_13;
+ pA[2+bs*3] = u_23;
+ tmp3 = pA[3+bs*3];
+ tmp3 -= pA[3+bs*0] * u_03;
+ tmp3 -= pA[3+bs*1] * u_13;
+ tmp3 -= pA[3+bs*2] * u_23;
+ pA[3+bs*3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp1 = pB[1+bs*3];
+ tmp2 = pB[2+bs*3];
+ tmp3 = pB[3+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp1 -= pB[1+bs*0] * u_03;
+ tmp2 -= pB[2+bs*0] * u_03;
+ tmp3 -= pB[3+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp1 -= pB[1+bs*1] * u_13;
+ tmp2 -= pB[2+bs*1] * u_13;
+ tmp3 -= pB[3+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ tmp1 -= pB[1+bs*2] * u_23;
+ tmp2 -= pB[2+bs*2] * u_23;
+ tmp3 -= pB[3+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB[1+bs*3] = tmp1;
+ pB[2+bs*3] = tmp2;
+ pB[3+bs*3] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB += 1;
+ }
+
+ sidamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+ ipiv[3] = idamax+3;
+ if(tmp3!=0)
+ {
+ if(ipiv[3]!=3)
+ srowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ tmp3 = 1.0 / pA[3+bs*3];
+ inv_diag_A[3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB[1+bs*3] *= tmp3;
+ pB[2+bs*3] *= tmp3;
+ pB[3+bs*3] *= tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[3] = 0.0;
+ }
+
+ return;
+
+ }
+#endif
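+
+// Hedged sketch (not BLASFEO code): the kernel above factorizes an m x 4 panel-major block as P*A = L*U
+// with partial pivoting, one column at a time: correct the column with the multipliers of the previous
+// columns, locate the pivot with sidamax_lib4, swap rows with srowsw_lib, store 1/pivot in inv_diag_A and
+// scale the sub-diagonal entries. The plain column-major reference below computes the same kind of
+// factorization with the equivalent right-looking update order; its name, the `lda` layout and the
+// BLASFEO_REF_SKETCH macro are illustrative assumptions, not part of the library.
+#if defined(BLASFEO_REF_SKETCH)
+static void ref_sgetrf_pivot(int m, int n, float *A, int lda, int *ipiv)
+	{
+	int i, j, k, ip;
+	float tmp;
+	for(j=0; j<n && j<m; j++)
+		{
+		// pivot search on column j (zero-based absolute row index, as in the kernel above)
+		ip = j;
+		for(i=j+1; i<m; i++)
+			if(fabsf(A[i+lda*j]) > fabsf(A[ip+lda*j]))
+				ip = i;
+		ipiv[j] = ip;
+		// swap rows j and ip across all n columns
+		if(ip!=j)
+			for(k=0; k<n; k++)
+				{
+				tmp = A[j+lda*k];
+				A[j+lda*k] = A[ip+lda*k];
+				A[ip+lda*k] = tmp;
+				}
+		if(A[j+lda*j]!=0.0f)
+			{
+			// scale the multipliers and update the trailing submatrix
+			for(i=j+1; i<m; i++)
+				A[i+lda*j] /= A[j+lda*j];
+			for(k=j+1; k<n; k++)
+				for(i=j+1; i<m; i++)
+					A[i+lda*k] -= A[i+lda*j] * A[j+lda*k];
+			}
+		}
+	}
+#endif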
+
+
+
+// it processes m>0 rows and 0<n<=4 cols
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
+void kernel_sgetrf_pivot_4_vs_lib4(int m, int n, float *pA, int sda, float *inv_diag_A, int* ipiv)
+ {
+
+ if(m<=0 || n<=0)
+ return;
+
+ const int bs = 4;
+
+	// ma = number of rows below the first 4x4 block (used only in the m>=4 branches)
+ int ma = m-4;
+
+ float
+ tmp0, tmp1, tmp2, tmp3,
+ u_00, u_01, u_02, u_03,
+ u_11, u_12, u_13,
+ u_22, u_23,
+ u_33;
+
+ float
+ *pB;
+
+ int
+ k, idamax;
+
+ // first column
+
+ // find pivot & scale
+ sidamax_lib4(m-0, 0, &pA[0+bs*0], sda, &idamax, &tmp0);
+ ipiv[0] = idamax;
+ if(tmp0!=0.0)
+ {
+ if(ipiv[0]!=0)
+ srowsw_lib(4, pA+0, pA+ipiv[0]/bs*bs*sda+ipiv[0]%bs);
+
+ tmp0 = 1.0 / pA[0+bs*0];
+ inv_diag_A[0] = tmp0;
+ if(m>=4)
+ {
+ pA[1+bs*0] *= tmp0;
+ pA[2+bs*0] *= tmp0;
+ pA[3+bs*0] *= tmp0;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB[1+bs*0] *= tmp0;
+ pB[2+bs*0] *= tmp0;
+ pB[3+bs*0] *= tmp0;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*0] *= tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {1,2,3}
+ {
+ if(m>1)
+ {
+ pA[1+bs*0] *= tmp0;
+ if(m>2)
+ pA[2+bs*0] *= tmp0;
+ }
+ }
+ }
+ else
+ {
+ inv_diag_A[0] = 0.0;
+ }
+
+ if(n==1 || m==1) // XXX for the first row there is nothing to do, so we can return here
+ return;
+
+ // second column
+
+ // correct
+ if(m>=4)
+ {
+ u_01 = pA[0+bs*1];
+ tmp1 = pA[1+bs*1];
+ tmp2 = pA[2+bs*1];
+ tmp3 = pA[3+bs*1];
+ tmp1 -= pA[1+bs*0] * u_01;
+ tmp2 -= pA[2+bs*0] * u_01;
+ tmp3 -= pA[3+bs*0] * u_01;
+ pA[1+bs*1] = tmp1;
+ pA[2+bs*1] = tmp2;
+ pA[3+bs*1] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp1 = pB[1+bs*1];
+ tmp2 = pB[2+bs*1];
+ tmp3 = pB[3+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ tmp1 -= pB[1+bs*0] * u_01;
+ tmp2 -= pB[2+bs*0] * u_01;
+ tmp3 -= pB[3+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB[1+bs*1] = tmp1;
+ pB[2+bs*1] = tmp2;
+ pB[3+bs*1] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*1];
+ tmp0 -= pB[0+bs*0] * u_01;
+ pB[0+bs*1] = tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ u_01 = pA[0+bs*1];
+ tmp1 = pA[1+bs*1];
+ tmp1 -= pA[1+bs*0] * u_01;
+ pA[1+bs*1] = tmp1;
+ if(m>2)
+ {
+ tmp2 = pA[2+bs*1];
+ tmp2 -= pA[2+bs*0] * u_01;
+ pA[2+bs*1] = tmp2;
+ }
+ }
+
+ // find pivot & scale
+ sidamax_lib4(m-1, 1, &pA[1+bs*1], sda, &idamax, &tmp1);
+ ipiv[1] = idamax+1;
+ if(tmp1!=0)
+ {
+ if(ipiv[1]!=1)
+ srowsw_lib(4, pA+1, pA+ipiv[1]/bs*bs*sda+ipiv[1]%bs);
+
+ tmp1 = 1.0 / pA[1+bs*1];
+ inv_diag_A[1] = tmp1;
+ if(m>=4)
+ {
+ pA[2+bs*1] *= tmp1;
+ pA[3+bs*1] *= tmp1;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB[1+bs*1] *= tmp1;
+ pB[2+bs*1] *= tmp1;
+ pB[3+bs*1] *= tmp1;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*1] *= tmp1;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ if(m>2)
+ pA[2+bs*1] *= tmp1;
+ }
+ }
+ else
+ {
+ inv_diag_A[1] = 0.0;
+ }
+
+ if(n==2)
+ return;
+
+ // third column
+
+ // correct
+ if(m>=4)
+ {
+ u_02 = pA[0+bs*2];
+ u_12 = pA[1+bs*2];
+ u_12 -= pA[1+bs*0] * u_02;
+ pA[1+bs*2] = u_12;
+ tmp2 = pA[2+bs*2];
+ tmp3 = pA[3+bs*2];
+ tmp2 -= pA[2+bs*0] * u_02;
+ tmp3 -= pA[3+bs*0] * u_02;
+ tmp2 -= pA[2+bs*1] * u_12;
+ tmp3 -= pA[3+bs*1] * u_12;
+ pA[2+bs*2] = tmp2;
+ pA[3+bs*2] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp1 = pB[1+bs*2];
+ tmp2 = pB[2+bs*2];
+ tmp3 = pB[3+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp1 -= pB[1+bs*0] * u_02;
+ tmp2 -= pB[2+bs*0] * u_02;
+ tmp3 -= pB[3+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ tmp1 -= pB[1+bs*1] * u_12;
+ tmp2 -= pB[2+bs*1] * u_12;
+ tmp3 -= pB[3+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB[1+bs*2] = tmp1;
+ pB[2+bs*2] = tmp2;
+ pB[3+bs*2] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*2];
+ tmp0 -= pB[0+bs*0] * u_02;
+ tmp0 -= pB[0+bs*1] * u_12;
+ pB[0+bs*2] = tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ u_02 = pA[0+bs*2];
+ u_12 = pA[1+bs*2];
+ u_12 -= pA[1+bs*0] * u_02;
+ pA[1+bs*2] = u_12;
+ if(m>2)
+ {
+ tmp2 = pA[2+bs*2];
+ tmp2 -= pA[2+bs*0] * u_02;
+ tmp2 -= pA[2+bs*1] * u_12;
+ pA[2+bs*2] = tmp2;
+ }
+ }
+
+ // find pivot & scale
+ if(m>2)
+ {
+ sidamax_lib4(m-2, 2, &pA[2+bs*2], sda, &idamax, &tmp2);
+ ipiv[2] = idamax+2;
+ if(tmp2!=0)
+ {
+ if(ipiv[2]!=2)
+ srowsw_lib(4, pA+2, pA+ipiv[2]/bs*bs*sda+ipiv[2]%bs);
+
+ tmp2 = 1.0 / pA[2+bs*2];
+ inv_diag_A[2] = tmp2;
+ if(m>=4)
+ {
+ pA[3+bs*2] *= tmp2;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB[1+bs*2] *= tmp2;
+ pB[2+bs*2] *= tmp2;
+ pB[3+bs*2] *= tmp2;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*2] *= tmp2;
+ pB += 1;
+ }
+ }
+ }
+ else
+ {
+ inv_diag_A[2] = 0.0;
+ }
+ }
+
+ if(n<4)
+ return;
+
+ // fourth column
+
+ // correct
+ if(m>=4)
+ {
+ u_03 = pA[0+bs*3];
+ u_13 = pA[1+bs*3];
+ u_13 -= pA[1+bs*0] * u_03;
+ pA[1+bs*3] = u_13;
+ u_23 = pA[2+bs*3];
+ u_23 -= pA[2+bs*0] * u_03;
+ u_23 -= pA[2+bs*1] * u_13;
+ pA[2+bs*3] = u_23;
+ tmp3 = pA[3+bs*3];
+ tmp3 -= pA[3+bs*0] * u_03;
+ tmp3 -= pA[3+bs*1] * u_13;
+ tmp3 -= pA[3+bs*2] * u_23;
+ pA[3+bs*3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp1 = pB[1+bs*3];
+ tmp2 = pB[2+bs*3];
+ tmp3 = pB[3+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp1 -= pB[1+bs*0] * u_03;
+ tmp2 -= pB[2+bs*0] * u_03;
+ tmp3 -= pB[3+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp1 -= pB[1+bs*1] * u_13;
+ tmp2 -= pB[2+bs*1] * u_13;
+ tmp3 -= pB[3+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ tmp1 -= pB[1+bs*2] * u_23;
+ tmp2 -= pB[2+bs*2] * u_23;
+ tmp3 -= pB[3+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB[1+bs*3] = tmp1;
+ pB[2+bs*3] = tmp2;
+ pB[3+bs*3] = tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ tmp0 = pB[0+bs*3];
+ tmp0 -= pB[0+bs*0] * u_03;
+ tmp0 -= pB[0+bs*1] * u_13;
+ tmp0 -= pB[0+bs*2] * u_23;
+ pB[0+bs*3] = tmp0;
+ pB += 1;
+ }
+ }
+ else // m = {2,3}
+ {
+ u_03 = pA[0+bs*3];
+ u_13 = pA[1+bs*3];
+ u_13 -= pA[1+bs*0] * u_03;
+ pA[1+bs*3] = u_13;
+ if(m>2)
+ {
+ u_23 = pA[2+bs*3];
+ u_23 -= pA[2+bs*0] * u_03;
+ u_23 -= pA[2+bs*1] * u_13;
+ pA[2+bs*3] = u_23;
+ }
+ }
+
+ if(m>3)
+ {
+ // find pivot & scale
+ sidamax_lib4(m-3, 3, &pA[3+bs*3], sda, &idamax, &tmp3);
+ ipiv[3] = idamax+3;
+ if(tmp3!=0)
+ {
+ if(ipiv[3]!=3)
+ srowsw_lib(4, pA+3, pA+ipiv[3]/bs*bs*sda+ipiv[3]%bs);
+
+ tmp3 = 1.0 / pA[3+bs*3];
+ inv_diag_A[3] = tmp3;
+ pB = pA + bs*sda;
+ for(k=0; k<ma-3; k+=4)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB[1+bs*3] *= tmp3;
+ pB[2+bs*3] *= tmp3;
+ pB[3+bs*3] *= tmp3;
+ pB += bs*sda;
+ }
+ for( ; k<ma; k++)
+ {
+ pB[0+bs*3] *= tmp3;
+ pB += 1;
+ }
+ }
+ else
+ {
+ inv_diag_A[3] = 0.0;
+ }
+ }
+
+ return;
+
+ }
+#endif
+
+
+
+
+
+
diff --git a/kernel/c99/kernel_ssymv_4_lib4.c b/kernel/c99/kernel_ssymv_4_lib4.c
new file mode 100644
index 0000000..5512154
--- /dev/null
+++ b/kernel/c99/kernel_ssymv_4_lib4.c
@@ -0,0 +1,1025 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_sgemv_nt_4_vs_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t, int km)
+ {
+
+ if(kmax<=0)
+ return;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ a_00, a_01, a_02, a_03,
+ x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+ x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+
+ x_n_0 = 0;
+ x_n_1 = 0;
+ x_n_2 = 0;
+ x_n_3 = 0;
+
+ x_n_0 = alpha_n[0]*x_n[0];
+ if(km>1)
+ {
+ x_n_1 = alpha_n[0]*x_n[1];
+ if(km>2)
+ {
+ x_n_2 = alpha_n[0]*x_n[2];
+ if(km>3)
+ {
+ x_n_3 = alpha_n[0]*x_n[3];
+ }
+ }
+ }
+
+ y_t_0 = 0;
+ y_t_1 = 0;
+ y_t_2 = 0;
+ y_t_3 = 0;
+
+ k = 0;
+ for(; k<kmax-3; k+=bs)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+ a_02 = A[1+bs*2];
+ a_03 = A[1+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+ a_03 = A[2+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ }
+
+ // store t
+ z_t[0] = alpha_t[0]*y_t_0 + beta_t[0]*y_t[0];
+ if(km>1)
+ {
+ z_t[1] = alpha_t[0]*y_t_1 + beta_t[0]*y_t[1];
+ if(km>2)
+ {
+ z_t[2] = alpha_t[0]*y_t_2 + beta_t[0]*y_t[2];
+ if(km>3)
+ {
+ z_t[3] = alpha_t[0]*y_t_3 + beta_t[0]*y_t[3];
+ }
+ }
+ }
+
+ return;
+
+ }
+#endif
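+
+
+
+// Editorial sketch (not part of the original kernel set): a plain-C reference of
+// what kernel_sgemv_nt_4_vs_lib4 computes on one 4-column, panel-major block
+// (bs=4 rows per panel, panel stride sda*bs): z_n += alpha_n*A*x_n fused with
+// z_t = alpha_t*A^T*x_t + beta_t*y_t, restricted to km (<=4) active columns.
+// The name ref_sgemv_nt_4_vs_lib4 and the by-value alpha/beta arguments are
+// assumptions made for readability; the kernel above takes them by pointer.
+static void ref_sgemv_nt_4_vs_lib4(int kmax, float alpha_n, float alpha_t, float *A, int sda, float *x_n, float *x_t, float beta_t, float *y_t, float *z_n, float *z_t, int km)
+	{
+	const int bs = 4;
+	int jmax = km<4 ? km : 4;
+	float y_t_acc[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+	int k, j;
+	if(kmax<=0)
+		return;
+	for(k=0; k<kmax; k++)
+		{
+		// row k of the block: offset (k/bs)*bs*sda selects the panel, k%bs the row in it, columns are bs floats apart
+		float *a_k = A + (k/bs)*bs*sda + k%bs;
+		for(j=0; j<jmax; j++)
+			{
+			z_n[k] += alpha_n * a_k[bs*j] * x_n[j]; // "n" sweep: A * x_n
+			y_t_acc[j] += a_k[bs*j] * x_t[k]; // "t" sweep: A^T * x_t
+			}
+		}
+	for(j=0; j<jmax; j++)
+		z_t[j] = alpha_t*y_t_acc[j] + beta_t*y_t[j];
+	}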
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_sgemv_nt_4_lib4(int kmax, float *alpha_n, float *alpha_t, float *A, int sda, float *x_n, float *x_t, float *beta_t, float *y_t, float *z_n, float *z_t)
+ {
+
+ kernel_sgemv_nt_4_vs_lib4(kmax, alpha_n, alpha_t, A, sda, x_n, x_t, beta_t, y_t, z_n, z_t, 4);
+
+ return;
+
+ }
+#endif
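+
+
+
+// Call sketch for the full-block wrapper (illustrative values, hypothetical caller
+// buffers); km is fixed to 4, so z_t and y_t must hold 4 entries:
+//   float alpha_n = 1.0f, alpha_t = 1.0f, beta_t = 0.0f;
+//   kernel_sgemv_nt_4_lib4(kmax, &alpha_n, &alpha_t, A, sda, x_n, x_t, &beta_t, y_t, z_n, z_t);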
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_ssymv_l_4_gen_lib4(int kmax, float *alpha, int offA, float *A, int sda, float *x_n, float *z_n, int km)
+ {
+
+ if(kmax<=0)
+ return;
+
+ float *x_t = x_n;
+ float *z_t = z_n;
+
+ const int bs = 4;
+
+ int k;
+
+ float
+ a_00, a_01, a_02, a_03,
+ x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
+ x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
+
+ x_n_0 = 0;
+ x_n_1 = 0;
+ x_n_2 = 0;
+ x_n_3 = 0;
+
+ x_n_0 = alpha[0]*x_n[0];
+ if(km>1)
+ {
+ x_n_1 = alpha[0]*x_n[1];
+ if(km>2)
+ {
+ x_n_2 = alpha[0]*x_n[2];
+ if(km>3)
+ {
+ x_n_3 = alpha[0]*x_n[3];
+ }
+ }
+ }
+
+ y_t_0 = 0;
+ y_t_1 = 0;
+ y_t_2 = 0;
+ y_t_3 = 0;
+
+ k = 0;
+ if(offA==0)
+ {
+ if(kmax<4)
+ {
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+ goto store_t;
+ }
+ else
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+ k += 4;
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ }
+ else if(offA==1)
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==5)
+ goto store_t;
+
+ // 5
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==6)
+ goto store_t;
+
+ // 6
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==7)
+ goto store_t;
+
+ k += 7;
+
+ }
+ else if(offA==2)
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==5)
+ goto store_t;
+
+ // 5
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==6)
+ goto store_t;
+
+ k += 6;
+
+ }
+ else // if(offA==3)
+ {
+
+ // 0
+
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+
+ y_t_0 += a_00 * x_t_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==1)
+ goto store_t;
+
+ // 1
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_t_1 += a_01 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==2)
+ goto store_t;
+
+ // 2
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_t_2 += a_02 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==3)
+ goto store_t;
+
+ // 3
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ if(kmax==4)
+ goto store_t;
+
+ // 4
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ A += (sda-1)*bs; // new panel
+
+ if(kmax==5)
+ goto store_t;
+
+ k += 5;
+
+ }
+ for(; k<kmax-3; k+=bs)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+
+ // 1
+
+ y_n_0 = z_n[1];
+ x_t_0 = x_t[1];
+
+ a_00 = A[1+bs*0];
+ a_01 = A[1+bs*1];
+ a_02 = A[1+bs*2];
+ a_03 = A[1+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[1] = y_n_0;
+
+
+ // 2
+
+ y_n_0 = z_n[2];
+ x_t_0 = x_t[2];
+
+ a_00 = A[2+bs*0];
+ a_01 = A[2+bs*1];
+ a_02 = A[2+bs*2];
+ a_03 = A[2+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[2] = y_n_0;
+
+
+ // 3
+
+ y_n_0 = z_n[3];
+ x_t_0 = x_t[3];
+
+ a_00 = A[3+bs*0];
+ a_01 = A[3+bs*1];
+ a_02 = A[3+bs*2];
+ a_03 = A[3+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[3] = y_n_0;
+
+
+ A += sda*bs;
+ z_n += 4;
+ x_t += 4;
+
+ }
+ for(; k<kmax; k++)
+ {
+
+ // 0
+
+ y_n_0 = z_n[0];
+ x_t_0 = x_t[0];
+
+ a_00 = A[0+bs*0];
+ a_01 = A[0+bs*1];
+ a_02 = A[0+bs*2];
+ a_03 = A[0+bs*3];
+
+ y_n_0 += a_00 * x_n_0;
+ y_t_0 += a_00 * x_t_0;
+ y_n_0 += a_01 * x_n_1;
+ y_t_1 += a_01 * x_t_0;
+ y_n_0 += a_02 * x_n_2;
+ y_t_2 += a_02 * x_t_0;
+ y_n_0 += a_03 * x_n_3;
+ y_t_3 += a_03 * x_t_0;
+
+ z_n[0] = y_n_0;
+
+ A += 1;
+ z_n += 1;
+ x_t += 1;
+
+ }
+
+ store_t:
+ z_t[0] += alpha[0]*y_t_0;
+ if(km>1)
+ {
+ z_t[1] += alpha[0]*y_t_1;
+ if(km>2)
+ {
+ z_t[2] += alpha[0]*y_t_2;
+ if(km>3)
+ {
+ z_t[3] += alpha[0]*y_t_3;
+ }
+ }
+ }
+
+ return;
+
+ }
+#endif
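+
+
+
+// Editorial sketch (not part of the original kernel set): a plain-C reference of
+// what kernel_ssymv_l_4_gen_lib4 computes when offA==0, i.e. a lower-stored
+// symmetric matrix-vector product restricted to one 4-column block. Strictly
+// lower entries feed both the "n" sweep (z += alpha*A*x) and the "t" sweep
+// (z += alpha*A^T*x); diagonal entries feed only the "t" sweep; together this
+// gives the symmetric product without touching the unstored upper triangle.
+// offA (not modeled here) is the row offset of the diagonal entry inside its
+// 4-row panel. The name ref_ssymv_l_4_lib4 and the by-value alpha are assumptions.
+static void ref_ssymv_l_4_lib4(int kmax, float alpha, float *A, int sda, float *x, float *z, int km)
+	{
+	const int bs = 4;
+	int jmax = km<4 ? km : 4;
+	float y_t[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+	int k, j;
+	if(kmax<=0)
+		return;
+	for(k=0; k<kmax; k++)
+		{
+		float *a_k = A + (k/bs)*bs*sda + k%bs;
+		for(j=0; j<jmax; j++)
+			{
+			if(k>j)
+				z[k] += alpha * a_k[bs*j] * x[j]; // strictly lower: "n" sweep
+			if(k>=j)
+				y_t[j] += a_k[bs*j] * x[k]; // lower incl. diagonal: "t" sweep
+			}
+		}
+	for(j=0; j<jmax; j++)
+		z[j] += alpha*y_t[j];
+	}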
+
+
+
+// XXX copy and scale y_n into z_n outside the kernel !!!!!
+#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15)
+void kernel_ssymv_l_4_lib4(int kmax, float *alpha, float *A, int sda, float *x_n, float *z_n)
+ {
+
+ kernel_ssymv_l_4_gen_lib4(kmax, alpha, 0, A, sda, x_n, z_n, 4);
+
+ return;
+
+ }
+#endif
+
+
+
+
+
diff --git a/kernel/fma/Makefile b/kernel/fma/Makefile
new file mode 100644
index 0000000..d7be280
--- /dev/null
+++ b/kernel/fma/Makefile
@@ -0,0 +1,49 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_AMD_BULLDOZER)
+OBJS += kernel_dgemm_4x4_lib4.o
+OBJS +=
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
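+# Build note (assumption: TARGET and LA come from ../../Makefile.rule but can be
+# overridden on the make command line); only the X64_AMD_BULLDOZER target selects
+# objects in this directory, e.g.:
+#   make TARGET=X64_AMD_BULLDOZER LA=HIGH_PERFORMANCE obj
+#   make clean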
diff --git a/kernel/fma/kernel_dgemm_4x4_lib4.S b/kernel/fma/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..a02f37d
--- /dev/null
+++ b/kernel/fma/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,3895 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp);
+#define EPILOGUE \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp);
+#define EPILOGUE \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
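+
+
+
+// Usage pattern of the macros above (illustrative, not a verbatim excerpt): a
+// C-callable kernel opens with PROLOGUE, reads its arguments only through
+// ARG1..ARG18 so that the same body serves both the System V and the Windows
+// x64 calling conventions, and closes with EPILOGUE before returning:
+//   PROLOGUE
+//   movq ARG1, %r10 // e.g. k
+//   movq ARG3, %r11 // e.g. A
+//   movq ARG4, %r12 // e.g. B
+//   // ... expand or call the inner_* routines below ...
+//   EPILOGUE
+//   ret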
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
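+// In C terms (illustrative): the loop accumulates D += A*B^T one k at a time,
+//   d_ij += A_ik * B_jk, i,j = 0..3,
+// into the xmm0..xmm7 accumulator layout listed above.
+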
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 0(%r11), %xmm8 // A[0]
+ vmovapd 16(%r11), %xmm9 // A[2]
+
+ vmovddup 0(%r12), %xmm12 // B[0]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 8(%r12), %xmm12 // B[1]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 16(%r12), %xmm12 // B[2]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 24(%r12), %xmm12 // B[3]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ subl $4, %r10d
+
+
+ // unroll 1
+ vmovapd 32(%r11), %xmm8 // A[4]
+ vmovapd 48(%r11), %xmm9 // A[6]
+
+ vmovddup 32(%r12), %xmm12 // B[4]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 40(%r12), %xmm12 // B[5]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 48(%r12), %xmm12 // B[6]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 56(%r12), %xmm12 // B[7]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 2
+ vmovapd 64(%r11), %xmm8 // A[8]
+ vmovapd 80(%r11), %xmm9 // A[10]
+
+ vmovddup 64(%r12), %xmm12 // B[8]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 72(%r12), %xmm12 // B[9]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 80(%r12), %xmm12 // B[10]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 88(%r12), %xmm12 // B[11]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 3
+ vmovapd 96(%r11), %xmm8 // A[12]
+ vmovapd 112(%r11), %xmm9 // A[14]
+
+ vmovddup 96(%r12), %xmm12 // B[12]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 104(%r12), %xmm12 // B[13]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 112(%r12), %xmm12 // B[14]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 120(%r12), %xmm12 // B[15]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ addq $128, %r11
+ addq $128, %r12
+
+
+ cmpl $4, %r10d
+
+
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmovapd 0(%r11), %xmm8 // A[0]
+ vmovapd 16(%r11), %xmm9 // A[2]
+
+ vmovddup 0(%r12), %xmm12 // B[0]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 8(%r12), %xmm12 // B[1]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 16(%r12), %xmm12 // B[2]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 24(%r12), %xmm12 // B[3]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 1
+ vmovapd 32(%r11), %xmm8 // A[4]
+ vmovapd 48(%r11), %xmm9 // A[6]
+
+ vmovddup 32(%r12), %xmm12 // B[4]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 40(%r12), %xmm12 // B[5]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 48(%r12), %xmm12 // B[6]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 56(%r12), %xmm12 // B[7]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 2
+ vmovapd 64(%r11), %xmm8 // A[8]
+ vmovapd 80(%r11), %xmm9 // A[10]
+
+ vmovddup 64(%r12), %xmm12 // B[8]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 72(%r12), %xmm12 // B[9]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 80(%r12), %xmm12 // B[10]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 88(%r12), %xmm12 // B[11]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 3
+ vmovapd 96(%r11), %xmm8 // A[12]
+ vmovapd 112(%r11), %xmm9 // A[14]
+
+ vmovddup 96(%r12), %xmm12 // B[12]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 104(%r12), %xmm12 // B[13]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 112(%r12), %xmm12 // B[14]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 120(%r12), %xmm12 // B[15]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ addq $128, %r12
+ addq $128, %r11
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %xmm8 // A[0]
+ vmovapd 16(%r11), %xmm9 // A[2]
+
+ vmovddup 0(%r12), %xmm12 // B[0]
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ subl $1, %r10d
+
+ vmovddup 8(%r12), %xmm12 // B[1]
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 16(%r12), %xmm12 // B[2]
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 24(%r12), %xmm12 // B[3]
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
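+// Same loop structure as inner_kernel_dgemm_add_nt_4x4_lib4 above, but with
+// vfnmadd231pd, i.e. it accumulates D -= A*B^T (d_ij -= A_ik*B_jk).
+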
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ vmovapd 0(%r11), %xmm8 // A[0]
+ vmovapd 16(%r11), %xmm9 // A[2]
+
+ vmovddup 0(%r12), %xmm12 // B[0]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 8(%r12), %xmm12 // B[1]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 16(%r12), %xmm12 // B[2]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 24(%r12), %xmm12 // B[3]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ subl $4, %r10d
+
+
+ // unroll 1
+ vmovapd 32(%r11), %xmm8 // A[4]
+ vmovapd 48(%r11), %xmm9 // A[6]
+
+ vmovddup 32(%r12), %xmm12 // B[4]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 40(%r12), %xmm12 // B[5]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 48(%r12), %xmm12 // B[6]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 56(%r12), %xmm12 // B[7]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 2
+ vmovapd 64(%r11), %xmm8 // A[8]
+ vmovapd 80(%r11), %xmm9 // A[10]
+
+ vmovddup 64(%r12), %xmm12 // B[8]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 72(%r12), %xmm12 // B[9]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 80(%r12), %xmm12 // B[10]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 88(%r12), %xmm12 // B[11]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 3
+ vmovapd 96(%r11), %xmm8 // A[12]
+ vmovapd 112(%r11), %xmm9 // A[14]
+
+ vmovddup 96(%r12), %xmm12 // B[12]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 104(%r12), %xmm12 // B[13]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 112(%r12), %xmm12 // B[14]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 120(%r12), %xmm12 // B[15]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ addq $128, %r12
+ addq $128, %r11
+
+
+ cmpl $4, %r10d
+
+
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ vmovapd 0(%r11), %xmm8 // A[0]
+ vmovapd 16(%r11), %xmm9 // A[2]
+
+ vmovddup 0(%r12), %xmm12 // B[0]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 8(%r12), %xmm12 // B[1]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 16(%r12), %xmm12 // B[2]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 24(%r12), %xmm12 // B[3]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 1
+ vmovapd 32(%r11), %xmm8 // A[4]
+ vmovapd 48(%r11), %xmm9 // A[6]
+
+ vmovddup 32(%r12), %xmm12 // B[4]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 40(%r12), %xmm12 // B[5]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 48(%r12), %xmm12 // B[6]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 56(%r12), %xmm12 // B[7]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 2
+ vmovapd 64(%r11), %xmm8 // A[8]
+ vmovapd 80(%r11), %xmm9 // A[10]
+
+ vmovddup 64(%r12), %xmm12 // B[8]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 72(%r12), %xmm12 // B[9]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 80(%r12), %xmm12 // B[10]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 88(%r12), %xmm12 // B[11]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ // unroll 3
+ vmovapd 96(%r11), %xmm8 // A[12]
+ vmovapd 112(%r11), %xmm9 // A[14]
+
+ vmovddup 96(%r12), %xmm12 // B[12]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovddup 104(%r12), %xmm12 // B[13]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 112(%r12), %xmm12 // B[14]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 120(%r12), %xmm12 // B[15]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+
+ addq $128, %r12
+ addq $128, %r11
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+ // unroll 0
+ vmovapd 0(%r11), %xmm8 // A[0]
+ vmovapd 16(%r11), %xmm9 // A[2]
+
+ vmovddup 0(%r12), %xmm12 // B[0]
+ vfnmadd231pd %xmm8, %xmm12, %xmm0
+ vfnmadd231pd %xmm9, %xmm12, %xmm1
+
+ subl $1, %r10d
+
+ vmovddup 8(%r12), %xmm12 // B[1]
+ vfnmadd231pd %xmm8, %xmm12, %xmm2
+ vfnmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovddup 16(%r12), %xmm12 // B[2]
+ vfnmadd231pd %xmm8, %xmm12, %xmm4
+ vfnmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovddup 24(%r12), %xmm12 // B[3]
+ vfnmadd231pd %xmm8, %xmm12, %xmm6
+ vfnmadd231pd %xmm9, %xmm12, %xmm7
+
+ addq $32, %r12
+ addq $32, %r11
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- B+4*4*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+
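+// B is upper triangular here: column j of B has only j+1 (possibly) nonzero
+// entries, so the four unrolled steps below use 1, 2, 3 and 4 fma pairs
+// respectively before advancing A and B past the 4x4 triangular edge.
+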
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r10), %xmm8
+ vmovapd 16(%r10), %xmm9
+ vmovddup 0(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+
+ vmovapd 32(%r10), %xmm8
+ vmovapd 48(%r10), %xmm9
+ vmovddup 32(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+ vmovddup 40(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+
+ vmovapd 64(%r10), %xmm8
+ vmovapd 80(%r10), %xmm9
+ vmovddup 64(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+ vmovddup 72(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+ vmovddup 80(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+
+ vmovapd 96(%r10), %xmm8
+ vmovapd 112(%r10), %xmm9
+ vmovddup 96(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+ vmovddup 104(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+ vmovddup 112(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+ vmovddup 120(%r11), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+
+ addq $128, %r10
+ addq $128, %r11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r11), %xmm8
+ vmovapd 16(%r11), %xmm9
+ subl $1, %r10d
+ vmovddup 0(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %xmm8
+ vmovapd 16(%r11), %xmm9
+ subl $1, %r10d
+ vmovddup 0(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+ addq $32, %r11
+ vmovddup 8(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %xmm8
+ vmovapd 16(%r11), %xmm9
+ subl $1, %r10d
+ vmovddup 0(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+ vmovddup 8(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+ addq $32, %r11
+ vmovddup 16(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ vmovapd 0(%r11), %xmm8
+ vmovapd 16(%r11), %xmm9
+ subl $1, %r10d
+ vmovddup 0(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm0
+ vfmadd231pd %xmm9, %xmm12, %xmm1
+ vmovddup 8(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm2
+ vfmadd231pd %xmm9, %xmm12, %xmm3
+ vmovddup 16(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm4
+ vfmadd231pd %xmm9, %xmm12, %xmm5
+ addq $32, %r11
+ vmovddup 24(%r12), %xmm12
+ vfmadd231pd %xmm8, %xmm12, %xmm6
+ vfmadd231pd %xmm9, %xmm12, %xmm7
+ addq $32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_4x4_lib4, @function
+inner_blend_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_4x4_lib4:
+#endif
+#endif
+
+ // XXX nothing to blend
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
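+// Net effect (illustrative), per half column held in xmm0..xmm7:
+//   acc <- alpha*acc + beta*C
+// with alpha broadcast from 0(%r10), beta from 0(%r11) and C read from 0(%r12) onwards.
+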
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_lib4, @function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // XXX nothing to blend
+
+ // alpha
+ movddup 0(%r10), %xmm15
+
+ mulpd %xmm15, %xmm0
+ mulpd %xmm15, %xmm1
+ mulpd %xmm15, %xmm2
+ mulpd %xmm15, %xmm3
+ mulpd %xmm15, %xmm4
+ mulpd %xmm15, %xmm5
+ mulpd %xmm15, %xmm6
+ mulpd %xmm15, %xmm7
+
+
+ // beta
+ movddup 0(%r11), %xmm14
+
+
+ vmovapd 0(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm0
+ vmovapd 16(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm1
+ vmovapd 32(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm2
+ vmovapd 48(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm3
+ vmovapd 64(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm4
+ vmovapd 80(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm5
+ vmovapd 96(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm6
+ vmovapd 112(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ movddup 0(%r10), %xmm15
+
+ mulpd %xmm15, %xmm0
+ mulpd %xmm15, %xmm1
+ mulpd %xmm15, %xmm2
+ mulpd %xmm15, %xmm3
+ mulpd %xmm15, %xmm4
+ mulpd %xmm15, %xmm5
+ mulpd %xmm15, %xmm6
+ mulpd %xmm15, %xmm7
+
+
+ // beta
+ movddup 0(%r11), %xmm14
+
+
+ vmovapd 0(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm0
+ vmovapd 16(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm1
+ vmovapd 32(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm2
+ vmovapd 48(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm3
+ vmovapd 64(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm4
+ vmovapd 80(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm5
+ vmovapd 96(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm6
+ vmovapd 112(%r12), %xmm15
+ vfmadd231pd %xmm14, %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_4x4_lib4, @function
+inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_4x4_lib4:
+#endif
+#endif
+
+ vmovapd 0(%r10), %xmm15
+ vaddpd %xmm0, %xmm15, %xmm0
+ vmovapd 16(%r10), %xmm15
+ vaddpd %xmm1, %xmm15, %xmm1
+ vmovapd 32(%r10), %xmm15
+ vaddpd %xmm2, %xmm15, %xmm2
+ vmovapd 48(%r10), %xmm15
+ vaddpd %xmm3, %xmm15, %xmm3
+ vmovapd 64(%r10), %xmm15
+ vaddpd %xmm4, %xmm15, %xmm4
+ vmovapd 80(%r10), %xmm15
+ vaddpd %xmm5, %xmm15, %xmm5
+ vmovapd 96(%r10), %xmm15
+ vaddpd %xmm6, %xmm15, %xmm6
+ vmovapd 112(%r10), %xmm15
+ vaddpd %xmm7, %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
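+// Cholesky step per column j (illustrative):
+//   if d_jj > 0: inv_diag_E[j] = 1/sqrt(d_jj) and column j is scaled by it,
+//   else:        inv_diag_E[j] = 0 and column j is zeroed,
+// followed by the rank-1 update of the remaining columns with the scaled column j;
+// kn (r11d) cuts the factorization short for partial blocks.
+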
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_4x4_vs_lib4, @function
+inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_vs_lib4:
+#endif
+#endif
+
+ vxorpd %xmm15, %xmm15, %xmm15 // 0.0
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ vmovsd .LC04(%rip), %xmm14 // 1.0
+#elif defined(OS_MAC)
+ vmovsd LC04(%rip), %xmm14 // 1.0
+#endif
+
+ vmovsd %xmm0, %xmm0, %xmm13
+ vucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+2:
+ cmpl $2, %r11d
+ vmovsd %xmm13, 0(%r10)
+ vmovddup %xmm13, %xmm13
+ vmulpd %xmm0, %xmm13, %xmm0
+ vmulpd %xmm1, %xmm13, %xmm1
+
+ jl 0f // ret
+
+ vpermilpd $0x3, %xmm0, %xmm13
+ vfnmadd231pd %xmm0, %xmm13, %xmm2
+ vfnmadd231pd %xmm1, %xmm13, %xmm3
+ vpermilpd $0x3, %xmm2, %xmm13
+ vucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+4:
+ cmpl $3, %r11d
+ vmovsd %xmm13, 8(%r10)
+ vmovddup %xmm13, %xmm13
+ vmulpd %xmm2, %xmm13, %xmm2
+ vmulpd %xmm3, %xmm13, %xmm3
+
+ jl 0f // ret
+
+ vpermilpd $0x0, %xmm1, %xmm13
+// vfnmadd231pd %xmm0, %xmm13, %xmm4
+ vfnmadd231pd %xmm1, %xmm13, %xmm5
+ vpermilpd $0x0, %xmm3, %xmm13
+// vfnmadd231pd %xmm2, %xmm13, %xmm4
+ vfnmadd231pd %xmm3, %xmm13, %xmm5
+ vmovaps %xmm5, %xmm13
+ vucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+6:
+ cmpl $4, %r11d
+ vmovsd %xmm13, 16(%r10)
+ vmovddup %xmm13, %xmm13
+// vmulpd %xmm4, %xmm13, %xmm4
+ vmulpd %xmm5, %xmm13, %xmm5
+
+ jl 0f // ret
+
+ vpermilpd $0x3, %xmm1, %xmm13
+// vfnmadd231pd %xmm0, %xmm13, %xmm6
+ vfnmadd231pd %xmm1, %xmm13, %xmm7
+ vpermilpd $0x3, %xmm3, %xmm13
+// vfnmadd231pd %xmm2, %xmm13, %xmm6
+ vfnmadd231pd %xmm3, %xmm13, %xmm7
+ vpermilpd $0x3, %xmm5, %xmm13
+// vfnmadd231pd %xmm4, %xmm13, %xmm6
+ vfnmadd231pd %xmm5, %xmm13, %xmm7
+ vpermilpd $0x3, %xmm7, %xmm13
+ vucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ vsqrtsd %xmm13, %xmm13, %xmm13
+ vdivsd %xmm13, %xmm14, %xmm13
+8:
+ vmovsd %xmm13, 24(%r10)
+ vmovddup %xmm13, %xmm13
+// vmulpd %xmm6, %xmm13, %xmm6
+ vmulpd %xmm7, %xmm13, %xmm7
+
+ jmp 0f
+
+1:
+ vxorpd %xmm13, %xmm13, %xmm13
+ jmp 2b
+
+3:
+ vxorpd %xmm13, %xmm13, %xmm13
+ jmp 4b
+
+5:
+ vxorpd %xmm13, %xmm13, %xmm13
+ jmp 6b
+
+7:
+ vxorpd %xmm13, %xmm13, %xmm13
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
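+// Right-lower-transposed solve (illustrative), column by column:
+//   d_col_j <- ( d_col_j - sum_{i<j} e_ji * d_col_i ) * inv_diag_E[j]
+// with the strictly-lower e_ji read from E (r10) and the reciprocal diagonal from r11.
+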
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+
+ vmovddup 0(%r11), %xmm13
+ vmulpd %xmm0, %xmm13, %xmm0
+ vmulpd %xmm1, %xmm13, %xmm1
+
+ vmovddup 8(%r10), %xmm13
+ vfnmadd231pd %xmm0, %xmm13, %xmm2
+ vfnmadd231pd %xmm1, %xmm13, %xmm3
+ vmovddup 8(%r11), %xmm13
+ vmulpd %xmm2, %xmm13, %xmm2
+ vmulpd %xmm3, %xmm13, %xmm3
+
+ vmovddup 16(%r10), %xmm13
+ vfnmadd231pd %xmm0, %xmm13, %xmm4
+ vfnmadd231pd %xmm1, %xmm13, %xmm5
+ vmovddup 48(%r10), %xmm13
+ vfnmadd231pd %xmm2, %xmm13, %xmm4
+ vfnmadd231pd %xmm3, %xmm13, %xmm5
+ vmovddup 16(%r11), %xmm13
+ vmulpd %xmm4, %xmm13, %xmm4
+ vmulpd %xmm5, %xmm13, %xmm5
+
+ vmovddup 24(%r10), %xmm13
+ vfnmadd231pd %xmm0, %xmm13, %xmm6
+ vfnmadd231pd %xmm1, %xmm13, %xmm7
+ vmovddup 56(%r10), %xmm13
+ vfnmadd231pd %xmm2, %xmm13, %xmm6
+ vfnmadd231pd %xmm3, %xmm13, %xmm7
+ vmovddup 88(%r10), %xmm13
+ vfnmadd231pd %xmm4, %xmm13, %xmm6
+ vfnmadd231pd %xmm5, %xmm13, %xmm7
+ vmovddup 24(%r11), %xmm13
+ vmulpd %xmm6, %xmm13, %xmm6
+ vmulpd %xmm7, %xmm13, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ vmovddup 0(%r11), %xmm13
+ cmpl $2, %r12d
+ vmulpd %xmm0, %xmm13, %xmm0
+ vmulpd %xmm1, %xmm13, %xmm1
+
+ jl 0f // ret
+
+ vmovddup 8(%r10), %xmm13
+ cmpl $3, %r12d
+ vfnmadd231pd %xmm0, %xmm13, %xmm2
+ vfnmadd231pd %xmm1, %xmm13, %xmm3
+ vmovddup 8(%r11), %xmm13
+ vmulpd %xmm2, %xmm13, %xmm2
+ vmulpd %xmm3, %xmm13, %xmm3
+
+ jl 0f // ret
+
+ vmovddup 16(%r10), %xmm13
+ cmpl $4, %r12d
+ vfnmadd231pd %xmm0, %xmm13, %xmm4
+ vfnmadd231pd %xmm1, %xmm13, %xmm5
+ vmovddup 48(%r10), %xmm13
+ vfnmadd231pd %xmm2, %xmm13, %xmm4
+ vfnmadd231pd %xmm3, %xmm13, %xmm5
+ vmovddup 16(%r11), %xmm13
+ vmulpd %xmm4, %xmm13, %xmm4
+ vmulpd %xmm5, %xmm13, %xmm5
+
+ jl 0f // ret
+
+ vmovddup 24(%r10), %xmm13
+ vfnmadd231pd %xmm0, %xmm13, %xmm6
+ vfnmadd231pd %xmm1, %xmm13, %xmm7
+ vmovddup 56(%r10), %xmm13
+ vfnmadd231pd %xmm2, %xmm13, %xmm6
+ vfnmadd231pd %xmm3, %xmm13, %xmm7
+ vmovddup 88(%r10), %xmm13
+ vfnmadd231pd %xmm4, %xmm13, %xmm6
+ vfnmadd231pd %xmm5, %xmm13, %xmm7
+ vmovddup 24(%r11), %xmm13
+ vmulpd %xmm6, %xmm13, %xmm6
+ vmulpd %xmm7, %xmm13, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+
+ vmovapd %xmm0, 0(%r10)
+ vmovapd %xmm1, 16(%r10)
+ vmovapd %xmm2, 32(%r10)
+ vmovapd %xmm3, 48(%r10)
+ vmovapd %xmm4, 64(%r10)
+ vmovapd %xmm5, 80(%r10)
+ vmovapd %xmm6, 96(%r10)
+ vmovapd %xmm7, 112(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// TODO use blendv instead
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_vs_lib4, @function
+inner_store_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+ jg 1f
+ je 0f
+
+ // km==1
+ cmpl $2, %r12d
+ vmovsd %xmm0, 0(%r10)
+ jl 4f // end
+ cmpl $3, %r12d
+ vmovsd %xmm2, 32(%r10)
+ jl 4f // end
+ vmovsd %xmm4, 64(%r10)
+ je 4f // end
+ vmovsd %xmm6, 96(%r10)
+
+ jmp 4f
+
+0:
+ // km==2
+ cmpl $2, %r12d
+ vmovapd %xmm0, 0(%r10)
+ jl 4f // end
+ cmpl $3, %r12d
+ vmovapd %xmm2, 32(%r10)
+ jl 4f // end
+ vmovapd %xmm4, 64(%r10)
+ je 4f // end
+ vmovapd %xmm6, 96(%r10)
+
+ jmp 4f
+
+1:
+ cmpl $3, %r11d
+ jg 2f
+
+ // km==3
+ cmpl $2, %r12d
+ vmovapd %xmm0, 0(%r10)
+ vmovsd %xmm1, 16(%r10)
+ jl 4f // end
+ cmpl $3, %r12d
+ vmovapd %xmm2, 32(%r10)
+ vmovsd %xmm3, 48(%r10)
+ jl 4f // end
+ vmovapd %xmm4, 64(%r10)
+ vmovsd %xmm5, 80(%r10)
+ je 4f // end
+ vmovapd %xmm6, 96(%r10)
+ vmovsd %xmm7, 112(%r10)
+
+ jmp 4f
+
+2:
+	// km==4
+ cmpl $2, %r12d
+ vmovapd %xmm0, 0(%r10)
+ vmovapd %xmm1, 16(%r10)
+ jl 4f // end
+ cmpl $3, %r12d
+ vmovapd %xmm2, 32(%r10)
+ vmovapd %xmm3, 48(%r10)
+ jl 4f // end
+ vmovapd %xmm4, 64(%r10)
+ vmovapd %xmm5, 80(%r10)
+ je 4f // end
+ vmovapd %xmm6, 96(%r10)
+ vmovapd %xmm7, 112(%r10)
+
+4:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_lib4, @function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_lib4:
+#endif
+#endif
+
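+	// store the lower triangle only: the mixed pairs at offsets 32 and 112
+	// reload the existing upper-triangle entry and merge it into the low lane
+	// before storing, while the all-upper pairs at offsets 64 and 96 are skipped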
+ vmovapd %xmm0, 0(%r10)
+ vmovapd %xmm1, 16(%r10)
+ vmovsd 32(%r10), %xmm15
+ vmovsd %xmm15, %xmm2, %xmm2
+ vmovapd %xmm2, 32(%r10)
+ vmovapd %xmm3, 48(%r10)
+// vmovapd %xmm4, 64(%r10)
+ vmovapd %xmm5, 80(%r10)
+// vmovapd %xmm6, 96(%r10)
+ vmovsd 112(%r10), %xmm15
+ vmovsd %xmm15, %xmm7, %xmm7
+ vmovapd %xmm7, 112(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs lower triangular
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_vs_lib4, @function
+inner_store_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+ jg 1f
+ je 0f
+
+ // km==1
+ vmovsd %xmm0, 0(%r10)
+
+ jmp 3f
+
+0:
+ // km==2
+ cmpl $2, %r12d
+ vmovapd %xmm0, 0(%r10)
+ jl 3f // end
+ vmovsd 32(%r10), %xmm15
+ vmovsd %xmm15, %xmm2, %xmm2
+ vmovapd %xmm2, 32(%r10)
+
+ jmp 3f
+
+1:
+ cmpl $3, %r11d
+ jg 2f
+
+ // km==3
+ cmpl $2, %r12d
+ vmovapd %xmm0, 0(%r10)
+ vmovsd %xmm1, 16(%r10)
+ jl 3f // end
+ cmpl $3, %r12d
+ vmovsd 32(%r10), %xmm15
+ vmovsd %xmm15, %xmm2, %xmm2
+ vmovapd %xmm2, 32(%r10)
+ vmovsd %xmm3, 48(%r10)
+ jl 3f // end
+// vmovapd %xmm4, 64(%r10)
+ vmovsd %xmm5, 80(%r10)
+
+ jmp 3f
+
+2:
+	// km==4
+ cmpl $2, %r12d
+ vmovapd %xmm0, 0(%r10)
+ vmovapd %xmm1, 16(%r10)
+ jl 3f // end
+ cmpl $3, %r12d
+ vmovsd 32(%r10), %xmm15
+ vmovsd %xmm15, %xmm2, %xmm2
+ vmovapd %xmm2, 32(%r10)
+ vmovapd %xmm3, 48(%r10)
+ jl 3f // end
+// vmovapd %xmm4, 64(%r10)
+ vmovapd %xmm5, 80(%r10)
+ je 3f // end
+// vmovapd %xmm6, 96(%r10)
+ vmovsd 112(%r10), %xmm15
+ vmovsd %xmm15, %xmm7, %xmm7
+ vmovapd %xmm7, 112(%r10)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .type kernel_dgemm_nt_4x4_lib4, @function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
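+
+// usage sketch (illustrative only): assuming the lib4 panel-major layout, where
+// a 4-row panel stores element (i,j) at p[j*4+i], a caller computing one 4x4
+// block of D = alpha*A*B^T + beta*C might look like the hypothetical C below;
+// k, A, B, C and D are placeholders for the caller's own data.
+//
+//   double alpha = 1.0, beta = 1.0;
+//   // A and B: 4 x k panels (4*k doubles); C and D: 4 x 4 panels (16 doubles)
+//   kernel_dgemm_nt_4x4_lib4(k, &alpha, A, B, &beta, C, D);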
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .type kernel_dgemm_nt_4x4_vs_lib4, @function
+kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_vs_lib4
+_kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .type kernel_dsyrk_nt_l_4x4_lib4, @function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_vs_lib4
+_kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .type kernel_dtrmm_nt_ru_4x4_lib4, @function
+kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_lib4
+_kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10
+ movq ARG4, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
+_kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+ // call inner loader nn
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9
+// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .type kernel_dpotrf_nt_l_4x4_lib4, @function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movl $4, %r11d // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
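+
+// semantics sketch (inferred from the call sequence above, not authoritative):
+// the kernel forms C - A*B^T, factorizes it as L*L^T, stores the lower factor
+// L in D and writes the reciprocals of the diagonal of L to inv_diag_D, e.g.
+//
+//   // hypothetical caller with 4x4 panel-major blocks
+//   kernel_dpotrf_nt_l_4x4_lib4(k, A, B, C, D, inv_diag_D);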
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16
+// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // km
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movl $4, %r11d
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
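+
+// semantics sketch (inferred from the call sequence above, not authoritative):
+// the fused kernel accumulates C + Ap*Bp^T - Am*Bm^T and then factorizes the
+// result exactly as kernel_dpotrf_nt_l_4x4_lib4 does, storing the lower factor
+// in D and the reciprocal diagonal in inv_diag_D.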
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
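+
+// semantics sketch (inferred from the rlt/inv naming and the call sequence
+// above, not authoritative): the kernel forms C - A*B^T and solves it against
+// the lower-triangular E applied transposed on the right, i.e. D*E^T = C - A*B^T,
+// with inv_diag_E expected to hold 1.0/E[i][i] so the solve can multiply
+// instead of divide.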
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
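+
+// note: each constant above is a 64-bit IEEE-754 double written as two 32-bit
+// halves, low word first (little endian); e.g. 1.0 = 0x3FF0000000000000 gives
+// { .long 0, .long 1072693248 } and 0.5 = 0x3FE0000000000000 gives
+// { .long 0, .long 1071644672 }, while the .quad -1 entries are all-ones masks.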
+
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+
diff --git a/kernel/sse3/Makefile b/kernel/sse3/Makefile
new file mode 100644
index 0000000..dbc07d1
--- /dev/null
+++ b/kernel/sse3/Makefile
@@ -0,0 +1,49 @@
+###################################################################################################
+# #
+# This file is part of BLASFEO. #
+# #
+# BLASFEO -- BLAS For Embedded Optimization. #
+# Copyright (C) 2016-2017 by Gianluca Frison. #
+# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. #
+# All rights reserved. #
+# #
+# HPMPC is free software; you can redistribute it and/or #
+# modify it under the terms of the GNU Lesser General Public #
+# License as published by the Free Software Foundation; either #
+# version 2.1 of the License, or (at your option) any later version. #
+# #
+# HPMPC is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. #
+# See the GNU Lesser General Public License for more details. #
+# #
+# You should have received a copy of the GNU Lesser General Public #
+# License along with HPMPC; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+# #
+# Author: Gianluca Frison, giaf (at) dtu.dk #
+# gianluca.frison (at) imtek.uni-freiburg.de #
+# #
+###################################################################################################
+
+include ../../Makefile.rule
+
+OBJS =
+
+ifeq ($(LA), HIGH_PERFORMANCE)
+
+ifeq ($(TARGET), X64_INTEL_CORE)
+OBJS += kernel_dgemm_4x4_lib4.o
+OBJS +=
+endif
+
+else # LA_REFERENCE | LA_BLAS
+
+endif # LA choice
+
+obj: $(OBJS)
+
+clean:
+ rm -f *.o
+ rm -f *.s
+
diff --git a/kernel/sse3/kernel_dgemm_4x4_lib4.S b/kernel/sse3/kernel_dgemm_4x4_lib4.S
new file mode 100644
index 0000000..26f35b6
--- /dev/null
+++ b/kernel/sse3/kernel_dgemm_4x4_lib4.S
@@ -0,0 +1,6235 @@
+/**************************************************************************************************
+* *
+* This file is part of BLASFEO. *
+* *
+* BLASFEO -- BLAS For Embedded Optimization. *
+* Copyright (C) 2016-2017 by Gianluca Frison. *
+* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
+* All rights reserved. *
+* *
+* HPMPC is free software; you can redistribute it and/or *
+* modify it under the terms of the GNU Lesser General Public *
+* License as published by the Free Software Foundation; either *
+* version 2.1 of the License, or (at your option) any later version. *
+* *
+* HPMPC is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
+* See the GNU Lesser General Public License for more details. *
+* *
+* You should have received a copy of the GNU Lesser General Public *
+* License along with HPMPC; if not, write to the Free Software *
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
+* *
+* Author: Gianluca Frison, giaf (at) dtu.dk *
+* gianluca.frison (at) imtek.uni-freiburg.de *
+* *
+**************************************************************************************************/
+
+#if defined(OS_LINUX) | defined(OS_MAC)
+
+//#define STACKSIZE 96
+#define STACKSIZE 64
+#define ARG1 %rdi
+#define ARG2 %rsi
+#define ARG3 %rdx
+#define ARG4 %rcx
+#define ARG5 %r8
+#define ARG6 %r9
+#define ARG7 STACKSIZE + 8(%rsp)
+#define ARG8 STACKSIZE + 16(%rsp)
+#define ARG9 STACKSIZE + 24(%rsp)
+#define ARG10 STACKSIZE + 32(%rsp)
+#define ARG11 STACKSIZE + 40(%rsp)
+#define ARG12 STACKSIZE + 48(%rsp)
+#define ARG13 STACKSIZE + 56(%rsp)
+#define ARG14 STACKSIZE + 64(%rsp)
+#define ARG15 STACKSIZE + 72(%rsp)
+#define ARG16 STACKSIZE + 80(%rsp)
+#define ARG17 STACKSIZE + 88(%rsp)
+#define ARG18 STACKSIZE + 96(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp);
+#define EPILOGUE \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ addq $STACKSIZE, %rsp;
+
+#elif defined(OS_WINDOWS)
+
+#define STACKSIZE 256
+#define ARG1 %rcx
+#define ARG2 %rdx
+#define ARG3 %r8
+#define ARG4 %r9
+#define ARG5 STACKSIZE + 40(%rsp)
+#define ARG6 STACKSIZE + 48(%rsp)
+#define ARG7 STACKSIZE + 56(%rsp)
+#define ARG8 STACKSIZE + 64(%rsp)
+#define ARG9 STACKSIZE + 72(%rsp)
+#define ARG10 STACKSIZE + 80(%rsp)
+#define ARG11 STACKSIZE + 88(%rsp)
+#define ARG12 STACKSIZE + 96(%rsp)
+#define ARG13 STACKSIZE + 104(%rsp)
+#define ARG14 STACKSIZE + 112(%rsp)
+#define ARG15 STACKSIZE + 120(%rsp)
+#define ARG16 STACKSIZE + 128(%rsp)
+#define ARG17 STACKSIZE + 136(%rsp)
+#define ARG18 STACKSIZE + 144(%rsp)
+#define PROLOGUE \
+ subq $STACKSIZE, %rsp; \
+ movq %rbx, (%rsp); \
+ movq %rbp, 8(%rsp); \
+ movq %r12, 16(%rsp); \
+ movq %r13, 24(%rsp); \
+ movq %r14, 32(%rsp); \
+ movq %r15, 40(%rsp); \
+ movq %rdi, 48(%rsp); \
+ movq %rsi, 56(%rsp); \
+ vmovups %xmm6, 64(%rsp); \
+ vmovups %xmm7, 80(%rsp); \
+ vmovups %xmm8, 96(%rsp); \
+ vmovups %xmm9, 112(%rsp); \
+ vmovups %xmm10, 128(%rsp); \
+ vmovups %xmm11, 144(%rsp); \
+ vmovups %xmm12, 160(%rsp); \
+ vmovups %xmm13, 176(%rsp); \
+ vmovups %xmm14, 192(%rsp); \
+ vmovups %xmm15, 208(%rsp);
+#define EPILOGUE \
+ movq (%rsp), %rbx; \
+ movq 8(%rsp), %rbp; \
+ movq 16(%rsp), %r12; \
+ movq 24(%rsp), %r13; \
+ movq 32(%rsp), %r14; \
+ movq 40(%rsp), %r15; \
+ movq 48(%rsp), %rdi; \
+ movq 56(%rsp), %rsi; \
+ vmovups 64(%rsp), %xmm6; \
+ vmovups 80(%rsp), %xmm7; \
+ vmovups 96(%rsp), %xmm8; \
+ vmovups 112(%rsp), %xmm9; \
+ vmovups 128(%rsp), %xmm10; \
+ vmovups 144(%rsp), %xmm11; \
+ vmovups 160(%rsp), %xmm12; \
+ vmovups 176(%rsp), %xmm13; \
+ vmovups 192(%rsp), %xmm14; \
+ vmovups 208(%rsp), %xmm15; \
+ addq $STACKSIZE, %rsp;
+
+#else
+
+#error wrong OS
+
+#endif
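+
+// note: PROLOGUE/EPILOGUE save and restore the callee-saved registers of each
+// ABI: rbx, rbp and r12-r15 on System V, plus rdi, rsi and xmm6-xmm15 on
+// Windows, where those are callee-saved as well.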
+
+
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .text
+#elif defined(OS_MAC)
+ .section __TEXT,__text,regular,pure_instructions
+#endif
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
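+// note: each k iteration is a rank-1 update of the 4x4 accumulator spread over
+// xmm0-xmm7 (two doubles per register); the loaded B pair is used both as-is
+// and swapped via pshufd $0x4e so that every A pair meets both B orderings,
+// and the resulting interleaving of the accumulators is undone later by the
+// inner blend routines called from the kernels below.
+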
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nt_4x4_lib4, @function
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+ movapd 0(%r12), %xmm10 // B[0]
+
+ xorpd %xmm11, %xmm11
+ movapd %xmm11, %xmm12
+ movapd %xmm11, %xmm13
+ movapd %xmm11, %xmm14
+ movapd %xmm11, %xmm15
+
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ addpd %xmm14, %xmm3
+ movapd 16(%r12), %xmm14 // B[2]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ addpd %xmm10, %xmm1
+ movapd 32(%r12), %xmm10 // B[4]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[4]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[6]
+
+
+ // unroll 1
+ addpd %xmm14, %xmm3
+ movapd 48(%r12), %xmm14 // B[6]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ addpd %xmm10, %xmm1
+ movapd 64(%r12), %xmm10 // B[8]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 64(%r11), %xmm8 // A[8]
+ mulpd %xmm9, %xmm13
+ movapd 80(%r11), %xmm9 // A[10]
+
+
+ // unroll 2
+ addpd %xmm14, %xmm3
+ movapd 80(%r12), %xmm14 // B[10]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $4, %r10d
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ addpd %xmm10, %xmm1
+ movapd 96(%r12), %xmm10 // B[12]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 96(%r11), %xmm8 // A[12]
+ mulpd %xmm9, %xmm13
+ movapd 112(%r11), %xmm9 // A[14]
+
+
+ // unroll 3
+ addpd %xmm14, %xmm3
+ movapd 112(%r12), %xmm14 // B[14]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addq $128, %r12 // B += 16
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $128, %r11 // A += 16
+
+ addpd %xmm10, %xmm1
+ movapd 0(%r12), %xmm10 // B[0]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ cmpl $4, %r10d
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 0(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+ movapd 16(%r11), %xmm9 // A[2]
+
+
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ addpd %xmm14, %xmm3
+ movapd 16(%r12), %xmm14 // B[2]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ addpd %xmm10, %xmm1
+ movapd 32(%r12), %xmm10 // B[4]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[4]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[6]
+
+
+ // unroll 1
+ addpd %xmm14, %xmm3
+ movapd 48(%r12), %xmm14 // B[6]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ addpd %xmm10, %xmm1
+ movapd 64(%r12), %xmm10 // B[8]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 64(%r11), %xmm8 // A[8]
+ mulpd %xmm9, %xmm13
+ movapd 80(%r11), %xmm9 // A[10]
+
+
+ // unroll 2
+ addpd %xmm14, %xmm3
+ movapd 80(%r12), %xmm14 // B[10]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $4, %r10d
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ addpd %xmm10, %xmm1
+ movapd 96(%r12), %xmm10 // B[12]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 96(%r11), %xmm8 // A[12]
+ mulpd %xmm9, %xmm13
+ movapd 112(%r11), %xmm9 // A[14]
+
+
+ // unroll 3
+ addpd %xmm14, %xmm3
+ movapd 112(%r12), %xmm14 // B[14]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addq $128, %r12 // B += 16
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $128, %r11 // A += 16
+
+ addpd %xmm10, %xmm1
+// movapd 0(%r12), %xmm10 // B[0]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+// cmpl $4, %r10d
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+// movapd 0(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+// movapd 16(%r11), %xmm9 // A[2]
+
+
+ // clean accumulators
+ addpd %xmm14, %xmm3
+ addpd %xmm11, %xmm7
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+
+ // unroll 0
+ addpd %xmm14, %xmm3
+ movapd 16(%r12), %xmm14 // B[2]
+ addpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $1, %r10d
+
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $32, %r12
+
+ addpd %xmm10, %xmm1
+ movapd 32(%r12), %xmm10 // B[0]
+ addpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addq $32, %r11
+
+ addpd %xmm15, %xmm0
+ addpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[2]
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+ // clean accumulators
+ addpd %xmm14, %xmm3
+ addpd %xmm11, %xmm7
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nt_4x4_lib4, .-inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+4*k*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d20 d30]
+// xmm2 <- [d01 d11]
+// xmm3 <- [d21 d31]
+// xmm4 <- [d02 d12]
+// xmm5 <- [d22 d32]
+// xmm6 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_sub_nt_4x4_lib4, @function
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_sub_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_sub_nt_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_sub_nt_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+ movapd 0(%r12), %xmm10 // B[0]
+
+ xorpd %xmm11, %xmm11
+ movapd %xmm11, %xmm12
+ movapd %xmm11, %xmm13
+ movapd %xmm11, %xmm14
+ movapd %xmm11, %xmm15
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ // unroll 0
+ subpd %xmm14, %xmm3
+ movapd 16(%r12), %xmm14 // B[2]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ subpd %xmm10, %xmm1
+ movapd 32(%r12), %xmm10 // B[4]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[4]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[6]
+
+
+ // unroll 1
+ subpd %xmm14, %xmm3
+ movapd 48(%r12), %xmm14 // B[6]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ subpd %xmm10, %xmm1
+ movapd 64(%r12), %xmm10 // B[8]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 64(%r11), %xmm8 // A[8]
+ mulpd %xmm9, %xmm13
+ movapd 80(%r11), %xmm9 // A[10]
+
+
+ // unroll 2
+ subpd %xmm14, %xmm3
+ movapd 80(%r12), %xmm14 // B[10]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $4, %r10d
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ subpd %xmm10, %xmm1
+ movapd 96(%r12), %xmm10 // B[12]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 96(%r11), %xmm8 // A[12]
+ mulpd %xmm9, %xmm13
+ movapd 112(%r11), %xmm9 // A[14]
+
+
+ // unroll 3
+ subpd %xmm14, %xmm3
+ movapd 112(%r12), %xmm14 // B[14]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addq $128, %r12 // B += 16
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $128, %r11 // A += 16
+
+ subpd %xmm10, %xmm1
+ movapd 0(%r12), %xmm10 // B[0]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ cmpl $4, %r10d
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 0(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+ movapd 16(%r11), %xmm9 // A[2]
+
+
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ subpd %xmm14, %xmm3
+ movapd 16(%r12), %xmm14 // B[2]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ subpd %xmm10, %xmm1
+ movapd 32(%r12), %xmm10 // B[4]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[4]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[6]
+
+
+ // unroll 1
+ subpd %xmm14, %xmm3
+ movapd 48(%r12), %xmm14 // B[6]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ subpd %xmm10, %xmm1
+ movapd 64(%r12), %xmm10 // B[8]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 64(%r11), %xmm8 // A[8]
+ mulpd %xmm9, %xmm13
+ movapd 80(%r11), %xmm9 // A[10]
+
+
+ // unroll 2
+ subpd %xmm14, %xmm3
+ movapd 80(%r12), %xmm14 // B[10]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $4, %r10d
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ subpd %xmm10, %xmm1
+ movapd 96(%r12), %xmm10 // B[12]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 96(%r11), %xmm8 // A[12]
+ mulpd %xmm9, %xmm13
+ movapd 112(%r11), %xmm9 // A[14]
+
+
+ // unroll 3
+ subpd %xmm14, %xmm3
+ movapd 112(%r12), %xmm14 // B[14]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addq $128, %r12 // B += 16
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $128, %r11 // A += 16
+
+ subpd %xmm10, %xmm1
+// movapd 0(%r12), %xmm10 // B[0]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+// cmpl $4, %r10d
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+// movapd 0(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+// movapd 16(%r11), %xmm9 // A[2]
+
+
+ // update accumulators
+ subpd %xmm14, %xmm3
+ subpd %xmm11, %xmm7
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+
+ // unroll 0
+ subpd %xmm14, %xmm3
+ movapd 16(%r12), %xmm14 // B[2]
+ subpd %xmm11, %xmm7
+ movapd %xmm10, %xmm11
+ pshufd $0x4e, %xmm10, %xmm15
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $1, %r10d
+
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $32, %r12
+
+ subpd %xmm10, %xmm1
+	movapd			0(%r12), %xmm10 // B[0]
+ subpd %xmm11, %xmm5
+ movapd %xmm14, %xmm11
+ pshufd $0x4e, %xmm14, %xmm12
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addq $32, %r11
+
+ subpd %xmm15, %xmm0
+ subpd %xmm13, %xmm4
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+	movapd			0(%r11), %xmm8 // A[0]
+	mulpd			%xmm9, %xmm13
+	movapd			16(%r11), %xmm9 // A[2]
+
+ cmpl $0, %r10d
+
+ jg 3b // clean up loop
+
+
+ // update accumulators
+ subpd %xmm14, %xmm3
+ subpd %xmm11, %xmm7
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_sub_nt_4x4_lib4, .-inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// r13 <- 4*sdb*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d <- 0
+// r11 <- A+4*k*sizeof(double)
+// r12 <- B+(k/4)*bs*sdb*sizeof(double)+(k%4)*sizeof(double)
+// r13 <- 4*sdb*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
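+// reference (C sketch, not assembled): the update computed by this routine,
+// assuming lib4 packing with bs=4 and B stored in 4-row panels with panel
+// stride sdb (r13 carries 4*sdb*sizeof(double)); illustrative names only:
+//
+//	void ref_dgemm_add_nn_4x4(int kmax, const double *A, const double *B, int sdb, double *D)
+//		{
+//		int ii, jj, kk;
+//		for(kk=0; kk<kmax; kk++)
+//			for(jj=0; jj<4; jj++)
+//				for(ii=0; ii<4; ii++)
+//					D[ii+4*jj] += A[ii+4*kk] * B[kk%4 + 4*jj + (kk/4)*4*sdb]; // D += A * B
+//		}
+//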
+#if MACRO_LEVEL>=2
+ .macro INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_kernel_dgemm_add_nn_4x4_lib4, @function
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_kernel_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_kernel_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_kernel_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // prefetch
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ xorpd %xmm11, %xmm11
+ movapd %xmm11, %xmm12
+ movapd %xmm11, %xmm13
+ movapd %xmm11, %xmm14
+ movapd %xmm11, %xmm15
+
+
+ cmpl $4, %r10d
+ jle 0f // consider clean-up loop
+
+ // main loop
+ .p2align 3
+1: // main loop
+
+ prefetcht0 0(%r12, %r13, 2) // software prefetch
+ prefetcht0 64(%r12, %r13, 2) // software prefetch
+
+ // unroll 0
+ movddup 0(%r12), %xmm10 // B[0]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ movddup 32(%r12), %xmm15 // B[4]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ movddup 64(%r12), %xmm14 // B[8]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 96(%r12), %xmm12 // B[12]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[4]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[6]
+
+
+ // unroll 1
+ movddup 8(%r12), %xmm10 // B[1]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ movddup 40(%r12), %xmm15 // B[5]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ movddup 72(%r12), %xmm14 // B[9]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 104(%r12), %xmm12 // B[13]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 64(%r11), %xmm8 // A[8]
+ mulpd %xmm9, %xmm13
+ movapd 80(%r11), %xmm9 // A[10]
+
+
+ // unroll 2
+ movddup 16(%r12), %xmm10 // B[2]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $4, %r10d
+
+ movddup 48(%r12), %xmm15 // B[6]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ movddup 80(%r12), %xmm14 // B[10]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 112(%r12), %xmm12 // B[14]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 96(%r11), %xmm8 // A[12]
+ mulpd %xmm9, %xmm13
+ movapd 112(%r11), %xmm9 // A[14]
+
+
+ // unroll 3
+ movddup 24(%r12), %xmm10 // B[3]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ movddup 56(%r12), %xmm15 // B[7]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $128, %r11 // A += 16
+
+ movddup 88(%r12), %xmm14 // B[11]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 120(%r12), %xmm12 // B[15]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 0(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+ movapd 16(%r11), %xmm9 // A[2]
+ addq %r13, %r12 // B += ...
+
+
+ cmpl $4, %r10d
+ jg 1b // main loop
+
+
+0: // consider clean4-up
+
+ cmpl $3, %r10d
+ jle 4f // clean1
+
+
+ // unroll 0
+ movddup 0(%r12), %xmm10 // B[0]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ movddup 32(%r12), %xmm15 // B[4]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ movddup 64(%r12), %xmm14 // B[8]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 96(%r12), %xmm12 // B[12]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 32(%r11), %xmm8 // A[4]
+ mulpd %xmm9, %xmm13
+ movapd 48(%r11), %xmm9 // A[6]
+
+
+ // unroll 1
+ movddup 8(%r12), %xmm10 // B[1]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ movddup 40(%r12), %xmm15 // B[5]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ movddup 72(%r12), %xmm14 // B[9]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 104(%r12), %xmm12 // B[13]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 64(%r11), %xmm8 // A[8]
+ mulpd %xmm9, %xmm13
+ movapd 80(%r11), %xmm9 // A[10]
+
+
+ // unroll 2
+ movddup 16(%r12), %xmm10 // B[2]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $4, %r10d
+
+ movddup 48(%r12), %xmm15 // B[6]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ movddup 80(%r12), %xmm14 // B[10]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 112(%r12), %xmm12 // B[14]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ movapd 96(%r11), %xmm8 // A[12]
+ mulpd %xmm9, %xmm13
+ movapd 112(%r11), %xmm9 // A[14]
+
+
+ // unroll 3
+ movddup 24(%r12), %xmm10 // B[3]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+
+ movddup 56(%r12), %xmm15 // B[7]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addq $128, %r11 // A += 16
+
+ movddup 88(%r12), %xmm14 // B[11]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+
+ movddup 120(%r12), %xmm12 // B[15]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+// movapd 0(%r11), %xmm8 // A[0]
+ mulpd %xmm9, %xmm13
+// movapd 16(%r11), %xmm9 // A[2]
+ addq %r13, %r12 // B += ...
+
+
+ // clean accumulators
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+
+ jmp 2f
+
+
+4: // consider clean1-up loop
+
+ cmpl $0, %r10d
+ jle 2f // return
+
+ // clean-up loop
+3: // clean up loop
+
+
+ // unroll 0
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ subl $1, %r10d
+
+ movddup 32(%r12), %xmm15 // B[4]
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+
+ movddup 64(%r12), %xmm14 // B[8]
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addq $32, %r11
+
+ movddup 96(%r12), %xmm12 // B[12]
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addq $8, %r12
+
+ cmpl $0, %r10d
+ jg 3b // clean up loop
+
+
+ // clean accumulators
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+
+2: // return
+
+#if MACRO_LEVEL>=2
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_kernel_dgemm_add_nn_4x4_lib4, .-inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B unaligned
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
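+// reference (C sketch, not assembled): the edge consumes kend = min(k, 4-offB)
+// columns of A against the last rows of the current B panel, so that the
+// panel-aligned kernel above can take over at a panel boundary; assuming lib4
+// packing with bs=4, illustrative names only:
+//
+//	kend = 4-offB<k ? 4-offB : k;
+//	for(ll=0; ll<kend; ll++)
+//		for(jj=0; jj<4; jj++)
+//			for(ii=0; ii<4; ii++)
+//				D[ii+4*jj] += A[ii+4*ll] * B[offB+ll + 4*jj];
+//	// then A += 4*kend, k -= kend, and (if k>0) B moves to the next panel
+//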
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dgemm_add_nn_4x4_lib4, @function
+inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dgemm_add_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dgemm_add_nn_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dgemm_add_nn_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d // offset==0
+ jle 2f // end
+
+ cmpl $0, %r10d // k==0
+ jle 2f // end
+
+ movl $4, %r15d
+ subl %r14d, %r15d // 4-offsetB
+ cmpl %r10d, %r15d
+// jle 0f
+// movl %r10d, %r15d // kend=min(k,4-offsetB)
+//0:
+ cmovgl %r10d, %r15d // kend=min(k,4-offsetB)
+
+ movl %r14d, %eax
+ sall $3, %eax // offsetB*sizeof(double)
+ addq %rax, %r12 // B+offsetB*sizeof(double)
+
+1:
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 32(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 64(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ movddup 96(%r12), %xmm12 // B[12]
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ subl $1, %r10d // k-1
+ subl $1, %r15d // kend-1
+	addq			$32, %r11 // A+1*bs*sizeof(double)
+	addq			$8, %r12 // B+1*sizeof(double)
+
+ cmpl $0, %r15d
+ jg 1b
+
+ cmpl $0, %r10d
+ jle 2f // end
+
+ addq %r13, %r12
+ subq $32, %r12 // B+bs*(sdb-1)*sizeof(double)
+
+2:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dgemm_add_nn_4x4_lib4, .-inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B lower triangular
+//
+// input arguments:
+// r10 <- k
+// r11 <- A
+// r12 <- B
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+//
+// output arguments:
+// r10 <- k-(4-offB)
+// r11 <- A+(4-offB)*bs*sizeof(double)
+// r12 <- B-offB+bs*sdb*sizeof(double)
+// r13 <- bs*sdb*sizeof(double)
+// r14 <- offB
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// ymm8 <- dirty
+// ymm12 <- dirty
+// ymm15 <- dirty
+
+
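+// reference (C sketch, not assembled): for offB==0 this edge adds the
+// contribution of the leading 4x4 lower-triangular block of B (nn case);
+// assuming lib4 packing with bs=4, illustrative names only:
+//
+//	for(kk=0; kk<4; kk++)
+//		for(jj=0; jj<=kk; jj++) // only the lower triangle of B is referenced
+//			for(ii=0; ii<4; ii++)
+//				D[ii+4*jj] += A[ii+4*kk] * B[kk+4*jj];
+//
+// the offB!=0 branches compute the same starting from row offB of the B panel
+// and wrap to the next panel where needed.
+//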
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nn_rl_4x4_lib4, @function
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nn_rl_4x4_lib4:
+#endif
+#endif
+
+ cmpl $0, %r14d
+ jg 0f
+
+ // offB==0
+
+ // unroll 0
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ // unroll 1
+ movapd 32(%r11), %xmm8 // A[0]
+ movapd 48(%r11), %xmm9 // A[2]
+
+ movddup 8(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 40(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ // unroll 2
+ movapd 64(%r11), %xmm8 // A[0]
+ movapd 80(%r11), %xmm9 // A[2]
+
+ movddup 16(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 48(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 80(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ // unroll 3
+ movapd 96(%r11), %xmm8 // A[0]
+ movapd 112(%r11), %xmm9 // A[2]
+
+ movddup 24(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 56(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 88(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ movddup 120(%r12), %xmm12 // B[12]
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+0:
+ cmpl $1, %r14d
+ jg 1f
+
+ // offB==1
+
+ addq $8, %r12 // B+1*sizeof(double)
+
+ // unroll 0
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ // unroll 1
+ movapd 32(%r11), %xmm8 // A[0]
+ movapd 48(%r11), %xmm9 // A[2]
+
+ movddup 8(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 40(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ // unroll 2
+ movapd 64(%r11), %xmm8 // A[0]
+ movapd 80(%r11), %xmm9 // A[2]
+
+ movddup 16(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 48(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 80(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ subl $3, %r10d // k-3
+ addq $96, %r11 // A+3*bs*sizeof(double)
+ addq %r13, %r12
+ subq $8, %r12 // B+bs*sdb*sizeof(double)-1
+
+ jmp 3f
+
+1:
+ cmpl $2, %r14d
+ jg 2f
+
+ // offB==2
+
+ addq $16, %r12 // B+2*sizeof(double)
+
+ // unroll 0
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ // unroll 1
+ movapd 32(%r11), %xmm8 // A[0]
+ movapd 48(%r11), %xmm9 // A[2]
+
+ movddup 8(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 40(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ subl $2, %r10d // k-2
+ addq $64, %r11 // A+2*bs*sizeof(double)
+ addq %r13, %r12
+ subq $16, %r12 // B+bs*sdb*sizeof(double)-2
+
+ // unroll 2
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 32(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 64(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ // unroll 3
+ movapd 32(%r11), %xmm8 // A[0]
+ movapd 48(%r11), %xmm9 // A[2]
+
+ movddup 8(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 40(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 72(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ movddup 104(%r12), %xmm12 // B[12]
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ // unroll 4
+ movapd 64(%r11), %xmm8 // A[0]
+ movapd 80(%r11), %xmm9 // A[2]
+
+ movddup 16(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 48(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 80(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ movddup 112(%r12), %xmm12 // B[12]
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ // unroll 5
+ movapd 96(%r11), %xmm8 // A[0]
+ movapd 112(%r11), %xmm9 // A[2]
+
+ movddup 24(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 56(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 88(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ movddup 120(%r12), %xmm12 // B[12]
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+ jmp 3f
+
+2:
+ // offB==3
+
+ addq $24, %r12 // B+3*sizeof(double)
+
+ // unroll 0
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ subl $1, %r10d // k-1
+ addq $32, %r11 // A+1*bs*sizeof(double)
+ addq %r13, %r12
+ subq $24, %r12 // B+bs*sdb*sizeof(double)-3
+
+ // unroll 1
+ movapd 0(%r11), %xmm8 // A[0]
+ movapd 16(%r11), %xmm9 // A[2]
+
+ movddup 0(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 32(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ // unroll 2
+ movapd 32(%r11), %xmm8 // A[0]
+ movapd 48(%r11), %xmm9 // A[2]
+
+ movddup 8(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 40(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 72(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ // unroll 3
+ movapd 64(%r11), %xmm8 // A[0]
+ movapd 80(%r11), %xmm9 // A[2]
+
+ movddup 16(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 48(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 80(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ movddup 112(%r12), %xmm12 // B[12]
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ // unroll 4
+ movapd 96(%r11), %xmm8 // A[0]
+ movapd 112(%r11), %xmm9 // A[2]
+
+ movddup 24(%r12), %xmm10 // B[0]
+ movapd %xmm10, %xmm11
+ mulpd %xmm8, %xmm10
+ mulpd %xmm9, %xmm11
+ addpd %xmm10, %xmm0
+ addpd %xmm11, %xmm4
+
+ movddup 56(%r12), %xmm15 // B[4]
+ movapd %xmm15, %xmm13
+ mulpd %xmm8, %xmm15
+ mulpd %xmm9, %xmm13
+ addpd %xmm15, %xmm1
+ addpd %xmm13, %xmm5
+
+ movddup 88(%r12), %xmm14 // B[8]
+ movapd %xmm14, %xmm11
+ mulpd %xmm8, %xmm14
+ mulpd %xmm9, %xmm11
+ addpd %xmm14, %xmm2
+ addpd %xmm11, %xmm6
+
+ movddup 120(%r12), %xmm12 // B[12]
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ subl $4, %r10d // k-4
+ addq $128, %r11 // A+4*bs*sizeof(double)
+ addq %r13, %r12 // B+bs*sdb*sizeof(double)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nn_rl_4x4_lib4, .-inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10 <- A
+// r11 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10 <- A+4*4*sizeof(double)
+// r11 <- B+4*4*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+
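+// reference (C sketch, not assembled): this edge adds the contribution of the
+// leading 4x4 upper-triangular block of B in the nt case (D += A * B^T);
+// assuming lib4 packing with bs=4, illustrative names only:
+//
+//	for(kk=0; kk<4; kk++)
+//		for(jj=0; jj<=kk; jj++) // only the upper triangle of B is referenced
+//			for(ii=0; ii<4; ii++)
+//				D[ii+4*jj] += A[ii+4*kk] * B[jj+4*kk];
+//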
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_lib4:
+#endif
+#endif
+
+ movapd 0(%r10), %xmm8
+ movapd 16(%r10), %xmm9
+ movddup 0(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+
+ movapd 32(%r10), %xmm8
+ movapd 48(%r10), %xmm9
+ movddup 32(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+ movddup 40(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm1
+ addpd %xmm13, %xmm5
+
+ movapd 64(%r10), %xmm8
+ movapd 80(%r10), %xmm9
+ movddup 64(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+ movddup 72(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm1
+ addpd %xmm13, %xmm5
+ movddup 80(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+
+ movapd 96(%r10), %xmm8
+ movapd 112(%r10), %xmm9
+ movddup 96(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+ movddup 104(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm1
+ addpd %xmm13, %xmm5
+ movddup 112(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ movddup 120(%r11), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+
+ addq $128, %r10
+ addq $128, %r11
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_lib4, .-inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// edge for B upper triangular
+//
+// input arguments:
+// r10d <- k
+// r11 <- A
+// r12 <- B
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+//
+// output arguments:
+// r10d <- max(k-4,0)
+// r11 <- A+4*4*sizeof(double)
+// r12 <- B+4*4*sizeof(double)
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrmm_nt_ru_4x4_vs_lib4, @function
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+#endif
+
+ movapd 0(%r11), %xmm8
+ movapd 16(%r11), %xmm9
+ subl $1, %r10d
+ movddup 0(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+ addq $32, %r11
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ movapd 0(%r11), %xmm8
+ movapd 16(%r11), %xmm9
+ subl $1, %r10d
+ movddup 0(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+ addq $32, %r11
+ movddup 8(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm1
+ addpd %xmm13, %xmm5
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ movapd 0(%r11), %xmm8
+ movapd 16(%r11), %xmm9
+ subl $1, %r10d
+ movddup 0(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+ movddup 8(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm1
+ addpd %xmm13, %xmm5
+ addq $32, %r11
+ movddup 16(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ addq $32, %r12
+
+ cmpl $0, %r10d
+ jle 0f
+
+ movapd 0(%r11), %xmm8
+ movapd 16(%r11), %xmm9
+ subl $1, %r10d
+ movddup 0(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm0
+ addpd %xmm13, %xmm4
+ movddup 8(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm1
+ addpd %xmm13, %xmm5
+ movddup 16(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm2
+ addpd %xmm13, %xmm6
+ addq $32, %r11
+ movddup 24(%r12), %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm8, %xmm12
+ mulpd %xmm9, %xmm13
+ addpd %xmm12, %xmm3
+ addpd %xmm13, %xmm7
+ addq $32, %r12
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrmm_nt_ru_4x4_vs_lib4, .-inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blend
+//
+// input arguments:
+// xmm0 <- [d01 d10]
+// xmm1 <- [d00 d11]
+// xmm2 <- [d03 d12]
+// xmm3 <- [d02 d13]
+// xmm4 <- [d21 d30]
+// xmm5 <- [d20 d31]
+// xmm6 <- [d23 d32]
+// xmm7 <- [d22 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
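+// reference (C sketch, not assembled): the blend exchanges the low double of
+// each register pair, de-interleaving the accumulation produced by the nt
+// kernels into the column order expected by the scale/store routines; sketch of
+// one pair, illustrative names only:
+//
+//	void ref_blend_pair(double a[2], double b[2])
+//		{
+//		double tmp = a[0]; // movapd %xmm0, %xmm8
+//		a[0] = b[0];       // movsd  %xmm1, %xmm0 (replace the low double only)
+//		b[0] = tmp;        // movsd  %xmm8, %xmm1
+//		}
+//
+// applied to the pairs (xmm0,xmm1), (xmm2,xmm3), (xmm4,xmm5), (xmm6,xmm7).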
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_4x4_lib4, @function
+inner_blend_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_4x4_lib4:
+#endif
+#endif
+
+ movapd %xmm0, %xmm8
+ movsd %xmm1, %xmm0
+ movsd %xmm8, %xmm1
+
+ movapd %xmm2, %xmm8
+ movsd %xmm3, %xmm2
+ movsd %xmm8, %xmm3
+
+ movapd %xmm4, %xmm8
+ movsd %xmm5, %xmm4
+ movsd %xmm8, %xmm5
+
+ movapd %xmm6, %xmm8
+ movsd %xmm7, %xmm6
+ movsd %xmm8, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_4x4_lib4, .-inner_blend_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
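+// reference (C sketch, not assembled): the scaling applied here, with acc the
+// 4x4 accumulator kept in xmm0..xmm7 and C stored column major with lib4
+// packing (bs=4); illustrative names only:
+//
+//	for(jj=0; jj<4; jj++)
+//		for(ii=0; ii<4; ii++)
+//			acc[ii+4*jj] = alpha*acc[ii+4*jj] + beta*C[ii+4*jj];
+//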
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_ab_4x4_lib4, @function
+inner_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ movddup 0(%r10), %xmm15
+
+ mulpd %xmm15, %xmm0
+ mulpd %xmm15, %xmm1
+ mulpd %xmm15, %xmm2
+ mulpd %xmm15, %xmm3
+ mulpd %xmm15, %xmm4
+ mulpd %xmm15, %xmm5
+ mulpd %xmm15, %xmm6
+ mulpd %xmm15, %xmm7
+
+
+ // beta
+ movddup 0(%r11), %xmm14
+
+ movapd 0(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm0
+ movapd 16(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm4
+ movapd 32(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm1
+ movapd 48(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm5
+ movapd 64(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm2
+ movapd 80(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm6
+ movapd 96(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm3
+ movapd 112(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta=0.0
+//
+// input arguments:
+// r10 <- alpha
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10 <- alpha
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_SCALE_A0_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_scale_a0_4x4_lib4, @function
+inner_scale_a0_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_scale_a0_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_scale_a0_4x4_lib4; .scl 2; .type 32; .endef
+inner_scale_a0_4x4_lib4:
+#endif
+#endif
+
+ // alpha
+ movddup 0(%r10), %xmm15
+
+ mulpd %xmm15, %xmm0
+ mulpd %xmm15, %xmm1
+ mulpd %xmm15, %xmm2
+ mulpd %xmm15, %xmm3
+ mulpd %xmm15, %xmm4
+ mulpd %xmm15, %xmm5
+ mulpd %xmm15, %xmm6
+ mulpd %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_scale_a0_4x4_lib4, .-inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// scale for generic alpha and beta
+//
+// input arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d01 d10]
+// xmm1 <- [d00 d11]
+// xmm2 <- [d03 d12]
+// xmm3 <- [d02 d13]
+// xmm4 <- [d21 d30]
+// xmm5 <- [d20 d31]
+// xmm6 <- [d23 d32]
+// xmm7 <- [d22 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10 <- alpha
+// r11 <- beta
+// r12 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_ab_4x4_lib4, @function
+inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_ab_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_ab_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_ab_4x4_lib4:
+#endif
+#endif
+
+ movapd %xmm0, %xmm8
+ movsd %xmm1, %xmm0
+ movsd %xmm8, %xmm1
+
+ movapd %xmm2, %xmm8
+ movsd %xmm3, %xmm2
+ movsd %xmm8, %xmm3
+
+ movapd %xmm4, %xmm8
+ movsd %xmm5, %xmm4
+ movsd %xmm8, %xmm5
+
+ movapd %xmm6, %xmm8
+ movsd %xmm7, %xmm6
+ movsd %xmm8, %xmm7
+
+ // alpha
+ movddup 0(%r10), %xmm15
+
+ mulpd %xmm15, %xmm0
+ mulpd %xmm15, %xmm1
+ mulpd %xmm15, %xmm2
+ mulpd %xmm15, %xmm3
+ mulpd %xmm15, %xmm4
+ mulpd %xmm15, %xmm5
+ mulpd %xmm15, %xmm6
+ mulpd %xmm15, %xmm7
+
+
+ // beta
+ movddup 0(%r11), %xmm14
+
+ movapd 0(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm0
+ movapd 16(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm4
+ movapd 32(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm1
+ movapd 48(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm5
+ movapd 64(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm2
+ movapd 80(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm6
+ movapd 96(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm3
+ movapd 112(%r12), %xmm15
+ mulpd %xmm14, %xmm15
+ addpd %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_blend_scale_ab_4x4_lib4, .-inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// blender for alpha = 1.0 and beta = 1.0
+//
+// input arguments:
+// r10 <- C
+// xmm0 <- [d01 d10]
+// xmm1 <- [d00 d11]
+// xmm2 <- [d03 d12]
+// xmm3 <- [d02 d13]
+// xmm4 <- [d21 d30]
+// xmm5 <- [d20 d31]
+// xmm6 <- [d23 d32]
+// xmm7 <- [d22 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+// output arguments:
+// r10 <- C
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_blend_scale_11_4x4_lib4, @function
+inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_blend_scale_11_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_blend_scale_11_4x4_lib4; .scl 2; .type 32; .endef
+inner_blend_scale_11_4x4_lib4:
+#endif
+#endif
+
+ movapd %xmm0, %xmm8
+ movsd %xmm1, %xmm0
+ movsd %xmm8, %xmm1
+
+ movapd %xmm2, %xmm8
+ movsd %xmm3, %xmm2
+ movsd %xmm8, %xmm3
+
+ movapd %xmm4, %xmm8
+ movsd %xmm5, %xmm4
+ movsd %xmm8, %xmm5
+
+ movapd %xmm6, %xmm8
+ movsd %xmm7, %xmm6
+ movsd %xmm8, %xmm7
+
+
+ movapd 0(%r10), %xmm15
+ addpd %xmm15, %xmm0
+ movapd 16(%r10), %xmm15
+ addpd %xmm15, %xmm4
+ movapd 32(%r10), %xmm15
+ addpd %xmm15, %xmm1
+ movapd 48(%r10), %xmm15
+ addpd %xmm15, %xmm5
+ movapd 64(%r10), %xmm15
+ addpd %xmm15, %xmm2
+ movapd 80(%r10), %xmm15
+ addpd %xmm15, %xmm6
+ movapd 96(%r10), %xmm15
+ addpd %xmm15, %xmm3
+ movapd 112(%r10), %xmm15
+ addpd %xmm15, %xmm7
+
+#if MACRO_LEVEL>=1
+	.endm
+#else
+	ret
+
+#if defined(OS_LINUX)
+	.size	inner_blend_scale_11_4x4_lib4, .-inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// cholesky factorization
+//
+// input arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- inv_diag_E
+// r11d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
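+// reference (C sketch, not assembled): factorization of the lower triangle of
+// the 4x4 accumulator, writing the reciprocals of the diagonal to inv_diag_E
+// and zeroing a column whose pivot is not positive; kn limits the number of
+// factorized columns; illustrative names only:
+//
+//	#include <math.h>
+//	void ref_potrf_4(double *D, double *inv_diag_E, int kn)
+//		{
+//		int ii, jj, kk;
+//		double tmp;
+//		for(jj=0; jj<kn; jj++)
+//			{
+//			tmp = D[jj+4*jj]>0.0 ? 1.0/sqrt(D[jj+4*jj]) : 0.0; // non-positive pivot => zero column
+//			inv_diag_E[jj] = tmp;
+//			for(ii=jj; ii<4; ii++)
+//				D[ii+4*jj] *= tmp;
+//			for(kk=jj+1; kk<kn; kk++)
+//				for(ii=kk; ii<4; ii++)
+//					D[ii+4*kk] -= D[ii+4*jj] * D[kk+4*jj];
+//			}
+//		}
+//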
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dpotrf_4x4_vs_lib4, @function
+inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dpotrf_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dpotrf_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dpotrf_4x4_vs_lib4:
+#endif
+#endif
+
+ xorpd %xmm15, %xmm15 // 0.0
+
+ movsd %xmm0, %xmm13
+ ucomisd %xmm15, %xmm13 // d_00 > 0.0 ?
+ jbe 1f
+ sqrtsd %xmm13, %xmm13
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ movsd .LC04(%rip), %xmm12 // 1.0
+#elif defined(OS_MAC)
+ movsd LC04(%rip), %xmm12 // 1.0
+#endif
+ divsd %xmm13, %xmm12
+2:
+ cmpl $2, %r11d
+ movsd %xmm12, 0(%r10)
+ movddup %xmm12, %xmm12
+ mulpd %xmm12, %xmm0
+ mulpd %xmm12, %xmm4
+
+ jl 0f // ret
+
+ movapd %xmm0, %xmm12
+ shufpd $0x3, %xmm12, %xmm12
+ movapd %xmm12, %xmm13
+ mulpd %xmm0, %xmm12
+ mulpd %xmm4, %xmm13
+ subpd %xmm12, %xmm1
+ subpd %xmm13, %xmm5
+ movapd %xmm1, %xmm13
+	shufpd			$0x3, %xmm13, %xmm13 // broadcast d_11 (high lane); only the low lane is read below
+ ucomisd %xmm15, %xmm13 // d_11 > 0.0 ?
+ jbe 3f
+ sqrtsd %xmm13, %xmm13
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ movsd .LC04(%rip), %xmm12 // 1.0
+#elif defined(OS_MAC)
+ movsd LC04(%rip), %xmm12 // 1.0
+#endif
+ divsd %xmm13, %xmm12
+4:
+ cmpl $3, %r11d
+ movsd %xmm12, 8(%r10)
+ movddup %xmm12, %xmm12
+ mulpd %xmm12, %xmm1
+ mulpd %xmm12, %xmm5
+
+ jl 0f // ret
+
+ movddup %xmm4, %xmm12
+ movddup %xmm5, %xmm13
+ mulpd %xmm4, %xmm12
+ mulpd %xmm5, %xmm13
+ subpd %xmm12, %xmm6
+ subpd %xmm13, %xmm6
+ movsd %xmm6, %xmm13
+ ucomisd %xmm15, %xmm13 // d_22 > 0.0 ?
+ jbe 5f
+ sqrtsd %xmm13, %xmm13
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ movsd .LC04(%rip), %xmm12 // 1.0
+#elif defined(OS_MAC)
+ movsd LC04(%rip), %xmm12 // 1.0
+#endif
+ divsd %xmm13, %xmm12
+6:
+ cmpl $4, %r11d
+ movsd %xmm12, 16(%r10)
+ movddup %xmm12, %xmm12
+ mulpd %xmm12, %xmm6
+
+ jl 0f // ret
+
+ movapd %xmm4, %xmm12
+ movapd %xmm5, %xmm13
+ movapd %xmm6, %xmm14
+ shufpd $0x3, %xmm12, %xmm12
+ shufpd $0x3, %xmm13, %xmm13
+ shufpd $0x3, %xmm14, %xmm14
+ mulpd %xmm4, %xmm12
+ mulpd %xmm5, %xmm13
+ mulpd %xmm6, %xmm14
+ subpd %xmm12, %xmm7
+ subpd %xmm13, %xmm7
+ subpd %xmm14, %xmm7
+ movapd %xmm7, %xmm13
+ shufpd $0x3, %xmm13, %xmm13
+ ucomisd %xmm15, %xmm13 // d_33 > 0.0 ?
+ jbe 7f
+ sqrtsd %xmm13, %xmm13
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ movsd .LC04(%rip), %xmm12 // 1.0
+#elif defined(OS_MAC)
+ movsd LC04(%rip), %xmm12 // 1.0
+#endif
+ divsd %xmm13, %xmm12
+8:
+ movsd %xmm12, 24(%r10)
+ movddup %xmm12, %xmm12
+ mulpd %xmm12, %xmm7
+
+ jmp 0f
+
+1:
+ xorpd %xmm12, %xmm12
+ jmp 2b
+
+3:
+ xorpd %xmm12, %xmm12
+ jmp 4b
+
+5:
+ xorpd %xmm12, %xmm12
+ jmp 6b
+
+7:
+ xorpd %xmm12, %xmm12
+ jmp 8b
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dpotrf_4x4_vs_lib4, .-inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- E
+// r11 <- inv_diag_E
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
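+// reference (C sketch, not assembled): right lower-transposed solve of the 4x4
+// accumulator, D <- D * E^-T, with E lower triangular (lib4 packed, bs=4) and
+// its inverted diagonal precomputed in inv_diag_E; illustrative names only:
+//
+//	for(jj=0; jj<4; jj++)
+//		{
+//		for(kk=0; kk<jj; kk++)
+//			for(ii=0; ii<4; ii++)
+//				D[ii+4*jj] -= D[ii+4*kk] * E[jj+4*kk];
+//		for(ii=0; ii<4; ii++)
+//			D[ii+4*jj] *= inv_diag_E[jj];
+//		}
+//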
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_lib4:
+#endif
+#endif
+
+ movddup 0(%r11), %xmm13
+ mulpd %xmm13, %xmm0
+ mulpd %xmm13, %xmm4
+
+ movddup 8(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm0, %xmm13
+ mulpd %xmm4, %xmm12
+ subpd %xmm13, %xmm1
+ subpd %xmm12, %xmm5
+ movddup 8(%r11), %xmm13
+ mulpd %xmm13, %xmm1
+ mulpd %xmm13, %xmm5
+
+ movddup 16(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm0, %xmm12
+ mulpd %xmm4, %xmm13
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movddup 48(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm1, %xmm12
+ mulpd %xmm5, %xmm13
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movddup 16(%r11), %xmm13
+ mulpd %xmm13, %xmm2
+ mulpd %xmm13, %xmm6
+
+ movddup 24(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm0, %xmm12
+ mulpd %xmm4, %xmm13
+ subpd %xmm12, %xmm3
+ subpd %xmm13, %xmm7
+ movddup 56(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm1, %xmm12
+ mulpd %xmm5, %xmm13
+ subpd %xmm12, %xmm3
+ subpd %xmm13, %xmm7
+ movddup 88(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm2, %xmm12
+ mulpd %xmm6, %xmm13
+ subpd %xmm12, %xmm3
+ subpd %xmm13, %xmm7
+ movddup 24(%r11), %xmm13
+ mulpd %xmm13, %xmm3
+ mulpd %xmm13, %xmm7
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// triangular substitution for cholesky factorization
+//
+// input arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11 <- inv_diag_D
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, @function
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_edge_dtrsm_rlt_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_edge_dtrsm_rlt_inv_4x4_vs_lib4:
+#endif
+#endif
+
+ movddup 0(%r11), %xmm13
+ cmpl $2, %r12d
+ mulpd %xmm13, %xmm0
+ mulpd %xmm13, %xmm4
+
+ jl 0f // ret
+
+ movddup 8(%r10), %xmm13
+ cmpl $3, %r12d
+ movapd %xmm13, %xmm12
+ mulpd %xmm0, %xmm13
+ mulpd %xmm4, %xmm12
+ subpd %xmm13, %xmm1
+ subpd %xmm12, %xmm5
+ movddup 8(%r11), %xmm13
+ mulpd %xmm13, %xmm1
+ mulpd %xmm13, %xmm5
+
+ jl 0f // ret
+
+ movddup 16(%r10), %xmm13
+ cmpl $4, %r12d
+ movapd %xmm13, %xmm12
+ mulpd %xmm0, %xmm12
+ mulpd %xmm4, %xmm13
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movddup 48(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm1, %xmm12
+ mulpd %xmm5, %xmm13
+ subpd %xmm12, %xmm2
+ subpd %xmm13, %xmm6
+ movddup 16(%r11), %xmm13
+ mulpd %xmm13, %xmm2
+ mulpd %xmm13, %xmm6
+
+ jl 0f // ret
+
+ movddup 24(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm0, %xmm12
+ mulpd %xmm4, %xmm13
+ subpd %xmm12, %xmm3
+ subpd %xmm13, %xmm7
+ movddup 56(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm1, %xmm12
+ mulpd %xmm5, %xmm13
+ subpd %xmm12, %xmm3
+ subpd %xmm13, %xmm7
+ movddup 88(%r10), %xmm13
+ movapd %xmm13, %xmm12
+ mulpd %xmm2, %xmm12
+ mulpd %xmm6, %xmm13
+ subpd %xmm12, %xmm3
+ subpd %xmm13, %xmm7
+ movddup 24(%r11), %xmm13
+ mulpd %xmm13, %xmm3
+ mulpd %xmm13, %xmm7
+
+0:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_edge_dtrsm_rlt_inv_4x4_vs_lib4, .-inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n
+//
+// input arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
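+// reference (C sketch, not assembled): the store, with D column major in lib4
+// packing (bs=4); column jj of the result is split between xmm(jj) (rows 0-1)
+// and xmm(4+jj) (rows 2-3); illustrative names only:
+//
+//	for(jj=0; jj<4; jj++)
+//		for(ii=0; ii<4; ii++)
+//			D[ii+4*jj] = acc[ii+4*jj];
+//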
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_lib4, @function
+inner_store_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_lib4:
+#endif
+#endif
+
+ movapd %xmm0, 0(%r10)
+ movapd %xmm4, 16(%r10)
+ movapd %xmm1, 32(%r10)
+ movapd %xmm5, 48(%r10)
+ movapd %xmm2, 64(%r10)
+ movapd %xmm6, 80(%r10)
+ movapd %xmm3, 96(%r10)
+ movapd %xmm7, 112(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_lib4, .-inner_store_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm1 <- [d01 d11]
+// xmm2 <- [d02 d12]
+// xmm3 <- [d03 d13]
+// xmm4 <- [d20 d30]
+// xmm5 <- [d21 d31]
+// xmm6 <- [d22 d32]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_vs_lib4, @function
+inner_store_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+ jg 1f
+ je 0f
+
+ // km==1
+ movsd %xmm0, 0(%r10)
+ cmpl $2, %r12d
+ jl 4f // end
+ movsd %xmm1, 32(%r10)
+ cmpl $3, %r12d
+ jl 4f // end
+ movsd %xmm2, 64(%r10)
+ je 4f // end
+ movsd %xmm3, 96(%r10)
+
+ jmp 4f
+
+0:
+ // km==2
+ movapd %xmm0, 0(%r10)
+ cmpl $2, %r12d
+ jl 4f // end
+ movapd %xmm1, 32(%r10)
+ cmpl $3, %r12d
+ jl 4f // end
+ movapd %xmm2, 64(%r10)
+ je 4f // end
+ movapd %xmm3, 96(%r10)
+
+ jmp 4f
+
+1:
+ cmpl $3, %r11d
+ jg 2f
+
+ // km==3
+ movapd %xmm0, 0(%r10)
+ movsd %xmm4, 16(%r10)
+ cmpl $2, %r12d
+ jl 4f // end
+ movapd %xmm1, 32(%r10)
+ movsd %xmm5, 48(%r10)
+ cmpl $3, %r12d
+ jl 4f // end
+ movapd %xmm2, 64(%r10)
+ movsd %xmm6, 80(%r10)
+ je 4f // end
+ movapd %xmm3, 96(%r10)
+ movsd %xmm7, 112(%r10)
+
+ jmp 4f
+
+2:
+ // km==4
+ movapd %xmm0, 0(%r10)
+ movapd %xmm4, 16(%r10)
+ cmpl $2, %r12d
+ jl 4f // end
+ movapd %xmm1, 32(%r10)
+ movapd %xmm5, 48(%r10)
+ cmpl $3, %r12d
+ jl 4f // end
+ movapd %xmm2, 64(%r10)
+ movapd %xmm6, 80(%r10)
+ je 4f // end
+ movapd %xmm3, 96(%r10)
+ movapd %xmm7, 112(%r10)
+
+4:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_vs_lib4, .-inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n generalized
+//
+// input arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n0 // col index: start from (inc)
+// rax <- n1 // col index: up to (exc)
+// rbx <- dirty
+// xmm0 <-
+//
+// output arguments:
+// r10 <- offset
+// r11 <- D
+// r12 <- 4*sdd*sizeof(double)
+// r13 <- m0 // row index: start from (inc)
+// r14 <- m1 // row index: up to (exc)
+// r15 <- n1-n0
+// rax <- n1-n0
+// rbx <- dirty
+// xmm0 <-
+
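+// A hedged C sketch (reference only, not part of the build) of the offset==0 path
+// below: the accumulator is shifted left by n0 columns, then only rows [m0,m1) and
+// columns [n0,n1) of the 4x4 tile are written (rows below m0 keep the values already
+// in D); the offset!=0 paths, which split the tile across two row panels 4*sdd
+// doubles apart, are still marked TODO in this file:
+//
+//     void inner_store_4x4_gen_lib4_ref(double *D, const double acc[16],
+//                                       int m0, int m1, int n0, int n1)
+//         {
+//         int i, j;
+//         if(m1>4) m1 = 4;
+//         if(n1>4) n1 = 4;
+//         for(j=n0; j<n1; j++)
+//             for(i=m0; i<m1; i++)
+//                 D[i+4*j] = acc[i+4*j];
+//         }
+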
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_4X4_GEN_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_4x4_gen_lib4, @function
+inner_store_4x4_gen_lib4:
+#elif defined(OS_MAC)
+_inner_store_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_4x4_gen_lib4; .scl 2; .type 32; .endef
+inner_store_4x4_gen_lib4:
+#endif
+#endif
+
+ // masks computation ???
+
+ // shift D and sol for cols
+ cmpl $0, %r15d
+ jle 0f
+
+ vmovapd %xmm1, %xmm0
+ vmovapd %xmm5, %xmm4
+ vmovapd %xmm2, %xmm1
+ vmovapd %xmm6, %xmm5
+ vmovapd %xmm3, %xmm2
+ vmovapd %xmm7, %xmm6
+ addq $32, %r11
+
+ cmpl $1, %r15d
+ jle 0f
+
+ vmovapd %xmm1, %xmm0
+ vmovapd %xmm5, %xmm4
+ vmovapd %xmm2, %xmm1
+ vmovapd %xmm6, %xmm5
+ addq $32, %r11
+
+ cmpl $2, %r15d
+ jle 0f
+
+ vmovapd %xmm1, %xmm0
+ vmovapd %xmm5, %xmm4
+ addq $32, %r11
+
+0:
+
+ // compute number of cols
+ cmpl $4, %eax
+ jle 0f
+ movl $4, %eax
+0:
+ subl %r15d, %eax
+ movl %eax, %r15d
+
+
+ cmpl $0, %r10d
+ jg 0f
+
+ ///////////////
+ // offset==0 //
+ ///////////////
+
+ cmpl $0, %r13d
+ jle 4f
+
+ cmpl $1, %r13d
+ jg 5f
+
+ movsd 0(%r11), %xmm8
+ movsd %xmm8, %xmm0
+ movsd 32(%r11), %xmm8
+ movsd %xmm8, %xmm1
+ movsd 64(%r11), %xmm8
+ movsd %xmm8, %xmm2
+ movsd 96(%r11), %xmm8
+ movsd %xmm8, %xmm3
+
+ jmp 4f
+
+5:
+
+ cmpl $2, %r13d
+ jg 5f
+
+ movapd 0(%r11), %xmm0
+ movapd 32(%r11), %xmm1
+ movapd 64(%r11), %xmm2
+ movapd 96(%r11), %xmm3
+
+ jmp 4f
+
+5:
+
+ cmpl $3, %r13d
+ jg 5f
+
+ movapd 0(%r11), %xmm0
+ movsd 16(%r11), %xmm8
+ movsd %xmm8, %xmm4
+ movapd 32(%r11), %xmm1
+ movsd 48(%r11), %xmm8
+ movsd %xmm8, %xmm5
+ movapd 64(%r11), %xmm2
+ movsd 80(%r11), %xmm8
+ movsd %xmm8, %xmm6
+ movapd 96(%r11), %xmm3
+ movsd 112(%r11), %xmm8
+ movsd %xmm8, %xmm7
+
+ jmp 4f
+
+5:
+
+ movapd 0(%r11), %xmm0
+ movapd 16(%r11), %xmm4
+ movapd 32(%r11), %xmm1
+ movapd 48(%r11), %xmm5
+ movapd 64(%r11), %xmm2
+ movapd 80(%r11), %xmm6
+ movapd 96(%r11), %xmm3
+ movapd 112(%r11), %xmm7
+
+4:
+ cmpl $2, %r14d
+ jg 5f
+ je 4f
+
+ // km==1
+ movsd %xmm0, 0(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ movsd %xmm1, 32(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ movsd %xmm2, 64(%r11)
+ je 3f // end
+ movsd %xmm3, 96(%r11)
+
+ jmp 3f
+
+4:
+ // km==2
+ movapd %xmm0, 0(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ movapd %xmm1, 32(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ movapd %xmm2, 64(%r11)
+ je 3f // end
+ movapd %xmm3, 96(%r11)
+
+ jmp 3f
+
+5:
+ cmpl $3, %r14d
+ jg 6f
+
+ // km==3
+ movapd %xmm0, 0(%r11)
+ movsd %xmm4, 16(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ movapd %xmm1, 32(%r11)
+ movsd %xmm5, 48(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ movapd %xmm2, 64(%r11)
+ movsd %xmm6, 80(%r11)
+ je 3f // end
+ movapd %xmm3, 96(%r11)
+ movsd %xmm7, 112(%r11)
+
+ jmp 3f
+
+6:
+ // km==4
+ movapd %xmm0, 0(%r11)
+ movapd %xmm4, 16(%r11)
+ cmpl $2, %r15d
+ jl 3f // end
+ movapd %xmm1, 32(%r11)
+ movapd %xmm5, 48(%r11)
+ cmpl $3, %r15d
+ jl 3f // end
+ movapd %xmm2, 64(%r11)
+ movapd %xmm6, 80(%r11)
+ je 3f // end
+ movapd %xmm3, 96(%r11)
+ movapd %xmm7, 112(%r11)
+
+ jmp 3f
+
+0:
+
+ movq %r11, %rbx // D0
+ addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double)
+
+ cmpl $1, %r10d
+ jg 1f
+
+ ///////////////
+ // offset==1 //
+ ///////////////
+
+ // TODO
+
+ jmp 3f
+
+1:
+
+ cmpl $2, %r10d
+ jg 2f
+
+ ///////////////
+ // offset==2 //
+ ///////////////
+
+ // TODO
+
+ jmp 3f
+
+2:
+
+ ///////////////
+ // offset==3 //
+ ///////////////
+
+ // TODO
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_4x4_gen_lib4, .-inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n lower triangular
+//
+// input arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm4 <- [d20 d30]
+// xmm1 <- [d01 d11]
+// xmm5 <- [d21 d31]
+// xmm2 <- [d02 d12]
+// xmm6 <- [d22 d32]
+// xmm3 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// xmm0 <- [d00 d10]
+// xmm4 <- [d20 d30]
+// xmm1 <- [d01 d11]
+// xmm5 <- [d21 d31]
+// xmm2 <- [d02 d12]
+// xmm6 <- [d22 d32]
+// xmm3 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
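+// A hedged C sketch (reference only, not part of the build) of the store below: only
+// the lower triangle (i >= j) of the 4x4 tile receives new values, while the strictly
+// upper entries of D are reloaded and written back unchanged:
+//
+//     void inner_store_l_4x4_lib4_ref(double *D, const double acc[16])
+//         {
+//         int i, j;
+//         for(j=0; j<4; j++)
+//             for(i=j; i<4; i++)
+//                 D[i+4*j] = acc[i+4*j];
+//         }
+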
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_lib4, @function
+inner_store_l_4x4_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_lib4:
+#endif
+#endif
+
+ movapd %xmm0, 0(%r10)
+ movapd %xmm4, 16(%r10)
+ movsd 32(%r10), %xmm15
+ movsd %xmm15, %xmm1
+ movapd %xmm1, 32(%r10)
+ movapd %xmm5, 48(%r10)
+// movapd %xmm2, 64(%r10)
+ movapd %xmm6, 80(%r10)
+// movapd %xmm3, 96(%r10)
+ movsd 112(%r10), %xmm15
+ movsd %xmm15, %xmm7
+ movapd %xmm7, 112(%r10)
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_lib4, .-inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+
+
+// common inner routine with file scope
+//
+// store n vs lower triangular
+//
+// input arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm4 <- [d20 d30]
+// xmm1 <- [d01 d11]
+// xmm5 <- [d21 d31]
+// xmm2 <- [d02 d12]
+// xmm6 <- [d22 d32]
+// xmm3 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+//
+// output arguments:
+// r10 <- D
+// r11d <- km
+// r12d <- kn
+// xmm0 <- [d00 d10]
+// xmm4 <- [d20 d30]
+// xmm1 <- [d01 d11]
+// xmm5 <- [d21 d31]
+// xmm2 <- [d02 d12]
+// xmm6 <- [d22 d32]
+// xmm3 <- [d03 d13]
+// xmm7 <- [d23 d33]
+// xmm8 <- dirty
+// xmm9 <- dirty
+// xmm10 <- dirty
+// xmm11 <- dirty
+// xmm12 <- dirty
+// xmm13 <- dirty
+// xmm14 <- dirty
+// xmm15 <- dirty
+
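+// A hedged C sketch (reference only, not part of the build) of the store below: the
+// lower-triangular store above combined with the km/kn clipping of the vs store,
+// assuming km >= kn as in the callers:
+//
+//     void inner_store_l_4x4_vs_lib4_ref(double *D, const double acc[16], int km, int kn)
+//         {
+//         int i, j;
+//         for(j=0; j<kn; j++)
+//             for(i=j; i<km; i++)
+//                 D[i+4*j] = acc[i+4*j];
+//         }
+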
+#if MACRO_LEVEL>=1
+ .macro INNER_STORE_L_4X4_VS_LIB4
+#else
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .type inner_store_l_4x4_vs_lib4, @function
+inner_store_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+_inner_store_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .def inner_store_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+inner_store_l_4x4_vs_lib4:
+#endif
+#endif
+
+ cmpl $2, %r11d
+ jg 1f
+ je 0f
+
+ // km==1
+ movsd %xmm0, 0(%r10)
+
+ jmp 3f
+
+0:
+ // km==2
+ cmpl $2, %r12d
+ movapd %xmm0, 0(%r10)
+ jl 3f // end
+ movsd 32(%r10), %xmm15
+ movsd %xmm15, %xmm1
+ movapd %xmm1, 32(%r10)
+
+ jmp 3f
+
+1:
+ cmpl $3, %r11d
+ jg 2f
+
+ // km==3
+ cmpl $2, %r12d
+ movapd %xmm0, 0(%r10)
+ movsd %xmm4, 16(%r10)
+ jl 3f // end
+ cmpl $3, %r12d
+ movsd 32(%r10), %xmm15
+ movsd %xmm15, %xmm1
+ movapd %xmm1, 32(%r10)
+ movsd %xmm5, 48(%r10)
+ jl 3f // end
+// movapd %xmm2, 64(%r10)
+ movsd %xmm6, 80(%r10)
+
+ jmp 3f
+
+2:
+	// km==4
+ cmpl $2, %r12d
+ movapd %xmm0, 0(%r10)
+ movapd %xmm4, 16(%r10)
+ jl 3f // end
+ cmpl $3, %r12d
+ movsd 32(%r10), %xmm15
+ movsd %xmm15, %xmm1
+ movapd %xmm1, 32(%r10)
+ movapd %xmm5, 48(%r10)
+ jl 3f // end
+// movapd %xmm2, 64(%r10)
+ movapd %xmm6, 80(%r10)
+ je 3f // end
+// movapd %xmm3, 96(%r10)
+ movsd 112(%r10), %xmm15
+ movsd %xmm15, %xmm7
+ movapd %xmm7, 112(%r10)
+
+3:
+
+#if MACRO_LEVEL>=1
+ .endm
+#else
+ ret
+
+#if defined(OS_LINUX)
+ .size inner_store_l_4x4_vs_lib4, .-inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dgemm_nt_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
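+// A hedged C reference (not part of the build) of the operation performed by this
+// kernel, assuming the panel-major layout used throughout this file: A and B are 4xk
+// panels with element (i,l) at A[i+4*l] and (j,l) at B[j+4*l], and C, D are 4x4 tiles
+// with element (i,j) at C[i+4*j]:
+//
+//     void kernel_dgemm_nt_4x4_lib4_ref(int k, double *alpha, double *A, double *B,
+//                                       double *beta, double *C, double *D)
+//         {
+//         int i, j, l;
+//         for(j=0; j<4; j++)
+//             for(i=0; i<4; i++)
+//                 {
+//                 double acc = 0.0;
+//                 for(l=0; l<k; l++)
+//                     acc += A[i+4*l] * B[j+4*l];
+//                 D[i+4*j] = alpha[0]*acc + beta[0]*C[i+4*j];
+//                 }
+//         }
+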
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .type kernel_dgemm_nt_4x4_lib4, @function
+kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_lib4
+_kernel_dgemm_nt_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_lib4
+ .def kernel_dgemm_nt_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_lib4, .-kernel_dgemm_nt_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dgemm_nt_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .type kernel_dgemm_nt_4x4_vs_lib4, @function
+kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_vs_lib4
+_kernel_dgemm_nt_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_vs_lib4
+ .def kernel_dgemm_nt_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_vs_lib4, .-kernel_dgemm_nt_4x4_vs_lib4
+#endif
+
+
+
+
+
+#if 0
+
+// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+// void kernel_dgemm_nt_4x4_gen_lib4(int k, double *alpha, double *A, double *B, double *beta, int offsetC, double *C, int sdc, int offsetD, double *D, int sdd, int m0, int m1, int n0, int n1);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nt_4x4_gen_lib4
+ .type kernel_dgemm_nt_4x4_gen_lib4, @function
+kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nt_4x4_gen_lib4
+_kernel_dgemm_nt_4x4_gen_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nt_4x4_gen_lib4
+ .def kernel_dgemm_nt_4x4_gen_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nt_4x4_gen_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+#if 0 //
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // offsetC
+ movq ARG7, %r13 // C
+ movq ARG8, %r14 // sdc
+ sall $5, %r14d // 4*sdc*sizeof(double)
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_gen_lib4
+#endif
+#endif
+
+#else //
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG7, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+#endif //
+
+ // store n gen
+
+ movq ARG9, %r10 // offsetD
+ movq ARG10, %r11 // D
+ movq ARG11, %r12 // sdd
+	sall	$5, %r12d // 4*sdd*sizeof(double)
+ movq ARG12, %r13 // m0
+ movq ARG13, %r14 // m1
+ movq ARG14, %r15 // n0
+ movq ARG15, %rax // n1
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_GEN_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_gen_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_gen_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nt_4x4_gen_lib4, .-kernel_dgemm_nt_4x4_gen_lib4
+#endif
+
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9
+// void kernel_dgemm_nn_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
+
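+// A hedged C reference (not part of the build) of the nn variant: B is traversed
+// along k in panel-major order, with offsetB rows to skip in the first panel and a
+// stride of 4*sdb doubles between consecutive row panels (this is my reading of the
+// offsetB/sdb handling in the edge routine):
+//
+//     void kernel_dgemm_nn_4x4_lib4_ref(int k, double *alpha, double *A, int offsetB,
+//                                       double *B, int sdb, double *beta, double *C,
+//                                       double *D)
+//         {
+//         int i, j, l, lb;
+//         for(j=0; j<4; j++)
+//             for(i=0; i<4; i++)
+//                 {
+//                 double acc = 0.0;
+//                 for(l=0; l<k; l++)
+//                     {
+//                     lb = offsetB + l;
+//                     acc += A[i+4*l] * B[(lb/4)*4*sdb + lb%4 + 4*j];
+//                     }
+//                 D[i+4*j] = alpha[0]*acc + beta[0]*C[i+4*j];
+//                 }
+//         }
+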
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x4_lib4
+ .type kernel_dgemm_nn_4x4_lib4, @function
+kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x4_lib4
+_kernel_dgemm_nn_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x4_lib4
+ .def kernel_dgemm_nn_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_lib4, .-kernel_dgemm_nn_4x4_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7 8 9 10 11
+// void kernel_dgemm_nn_4x4_vs_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_nn_4x4_vs_lib4
+ .type kernel_dgemm_nn_4x4_vs_lib4, @function
+kernel_dgemm_nn_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_nn_4x4_vs_lib4
+_kernel_dgemm_nn_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_nn_4x4_vs_lib4
+ .def kernel_dgemm_nn_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_nn_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+	// call inner dgemm kernel nn
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG7, %r11 // beta
+ movq ARG8, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG9, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_nn_4x4_vs_lib4, .-kernel_dgemm_nn_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dsyrk_nt_l_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
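+// A hedged C reference (not part of the build): same accumulation as the dgemm nt
+// kernel above, but only the lower triangle of the 4x4 tile is written back (see
+// INNER_STORE_L_4X4_LIB4):
+//
+//     void kernel_dsyrk_nt_l_4x4_lib4_ref(int k, double *alpha, double *A, double *B,
+//                                         double *beta, double *C, double *D)
+//         {
+//         int i, j, l;
+//         for(j=0; j<4; j++)
+//             for(i=j; i<4; i++)
+//                 {
+//                 double acc = 0.0;
+//                 for(l=0; l<k; l++)
+//                     acc += A[i+4*l] * B[j+4*l];
+//                 D[i+4*j] = alpha[0]*acc + beta[0]*C[i+4*j];
+//                 }
+//         }
+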
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .type kernel_dsyrk_nt_l_4x4_lib4, @function
+kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_lib4
+_kernel_dsyrk_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_lib4
+ .def kernel_dsyrk_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_lib4, .-kernel_dsyrk_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_nt_l_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_nt_l_4x4_vs_lib4
+_kernel_dsyrk_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_nt_l_4x4_vs_lib4, .-kernel_dsyrk_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8
+// void kernel_dtrmm_nt_ru_4x4_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D);
+
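+// A hedged C reference (not part of the build) of my reading of this kernel: the
+// leading 4x4 block of B is treated as upper triangular (the "initial triangle"
+// edge), so the inner product for column j only starts at l=j, while the remaining
+// k-4 columns are handled by the plain gemm loop:
+//
+//     void kernel_dtrmm_nt_ru_4x4_lib4_ref(int k, double *alpha, double *A, double *B,
+//                                          double *beta, double *C, double *D)
+//         {
+//         int i, j, l;
+//         for(j=0; j<4; j++)
+//             for(i=0; i<4; i++)
+//                 {
+//                 double acc = 0.0;
+//                 for(l=j; l<k; l++)
+//                     acc += A[i+4*l] * B[j+4*l];
+//                 D[i+4*j] = alpha[0]*acc + beta[0]*C[i+4*j];
+//                 }
+//         }
+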
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .type kernel_dtrmm_nt_ru_4x4_lib4, @function
+kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_lib4
+_kernel_dtrmm_nt_ru_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_lib4
+ .def kernel_dtrmm_nt_ru_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blend
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG3, %r10
+ movq ARG4, %r11
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_lib4, .-kernel_dtrmm_nt_ru_4x4_lib4
+#endif
+
+
+
+
+
+// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrmm_nt_ru_4x4_vs_lib4(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .type kernel_dtrmm_nt_ru_4x4_vs_lib4, @function
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nt_ru_4x4_vs_lib4
+_kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nt_ru_4x4_vs_lib4
+ .def kernel_dtrmm_nt_ru_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nt_ru_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt after initial triangle
+
+ movq ARG1, %r10 // k
+ subl $4, %r10d // k-4
+ movq ARG3, %r11 // A
+ addq $128, %r11 // A+4*bs
+ movq ARG4, %r12 // B
+ addq $128, %r12 // B+4*bs
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender nn
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_4x4_lib4
+#endif
+#endif
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG4, %r12 // B
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+#endif
+
+
+	// call inner scale
+
+ movq ARG2, %r10 // alpha
+ movq ARG5, %r11 // beta
+ movq ARG6, %r12 // C
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_AB_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_ab_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_ab_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nt_ru_4x4_vs_lib4, .-kernel_dtrmm_nt_ru_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9
+// void kernel_dpotrf_nt_l_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
+
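+// A hedged C reference (not part of the build) of my reading of this kernel: the
+// 4x4 tile C - A*B^T is factorized as D*D^T with D lower triangular, and inv_diag_D
+// receives the reciprocals of the diagonal of D (the assembly edge also guards the
+// non-positive-pivot case, which this sketch omits):
+//
+//     #include <math.h>
+//
+//     void kernel_dpotrf_nt_l_4x4_lib4_ref(int k, double *A, double *B, double *C,
+//                                          double *D, double *inv_diag_D)
+//         {
+//         int i, j, l;
+//         double tmp[16];
+//         for(j=0; j<4; j++)
+//             for(i=0; i<4; i++)
+//                 {
+//                 tmp[i+4*j] = C[i+4*j];
+//                 for(l=0; l<k; l++)
+//                     tmp[i+4*j] -= A[i+4*l] * B[j+4*l];
+//                 }
+//         for(j=0; j<4; j++)
+//             {
+//             double d = tmp[j+4*j];
+//             for(l=0; l<j; l++)
+//                 d -= D[j+4*l] * D[j+4*l];
+//             d = sqrt(d);
+//             D[j+4*j] = d;
+//             inv_diag_D[j] = 1.0/d;
+//             for(i=j+1; i<4; i++)
+//                 {
+//                 double v = tmp[i+4*j];
+//                 for(l=0; l<j; l++)
+//                     v -= D[i+4*l] * D[j+4*l];
+//                 D[i+4*j] = v * inv_diag_D[j];
+//                 }
+//             }
+//         }
+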
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .type kernel_dpotrf_nt_l_4x4_lib4, @function
+kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_lib4
+_kernel_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_lib4
+ .def kernel_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movl $4, %r11d // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_lib4, .-kernel_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16
+// void kernel_dpotrf_nt_l_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG6, %r10 // inv_diag_D
+ movq ARG8, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG7, %r11 // km
+ movq ARG8, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+
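+// A hedged C sketch (not part of the build; the _ref_build name is illustrative) of
+// how the fused kernel builds the tile it factorizes, reusing the dpotrf reference
+// above for the factorization step:
+//
+//     void kernel_dsyrk_dpotrf_nt_l_4x4_lib4_ref_build(int kp, double *Ap, double *Bp,
+//                                                      int km, double *Am, double *Bm,
+//                                                      double *C, double *tmp)
+//         {
+//         int i, j, l;
+//         for(j=0; j<4; j++)
+//             for(i=0; i<4; i++)
+//                 {
+//                 tmp[i+4*j] = C[i+4*j];
+//                 for(l=0; l<kp; l++)
+//                     tmp[i+4*j] += Ap[i+4*l] * Bp[j+4*l];
+//                 for(l=0; l<km; l++)
+//                     tmp[i+4*j] -= Am[i+4*l] * Bm[j+4*l];
+//                 }
+//         // tmp is then factorized exactly as in kernel_dpotrf_nt_l_4x4_lib4_ref above
+//         }
+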
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+	movl	$4, %r11d // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40
+// void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .type kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, @function
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+_kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+ .def kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // factorization
+
+ movq ARG9, %r10 // inv_diag_D
+ movq ARG11, %r11 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DPOTRF_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dpotrf_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dpotrf_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG10, %r11 // km
+ movq ARG11, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_L_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_l_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_l_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4, .-kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8
+// void kernel_dtrsm_nt_rl_inv_4x4_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E);
+
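+// A hedged C reference (not part of the build) of my reading of this kernel: the
+// tile tmp = C - A*B^T is right-solved against the transpose of the lower-triangular
+// 4x4 block E, i.e. D*E^T = tmp, using the precomputed reciprocals inv_diag_E:
+//
+//     void kernel_dtrsm_nt_rl_inv_4x4_lib4_ref(int k, double *A, double *B, double *C,
+//                                              double *D, double *E, double *inv_diag_E)
+//         {
+//         int i, j, l;
+//         double tmp[16];
+//         for(j=0; j<4; j++)
+//             for(i=0; i<4; i++)
+//                 {
+//                 tmp[i+4*j] = C[i+4*j];
+//                 for(l=0; l<k; l++)
+//                     tmp[i+4*j] -= A[i+4*l] * B[j+4*l];
+//                 }
+//         for(j=0; j<4; j++)
+//             for(i=0; i<4; i++)
+//                 {
+//                 double v = tmp[i+4*j];
+//                 for(l=0; l<j; l++)
+//                     v -= D[i+4*l] * E[j+4*l];
+//                 D[i+4*j] = v * inv_diag_E[j];
+//                 }
+//         }
+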
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG4, %r10
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24
+// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int k, double *A, double *B, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt
+
+ movq ARG1, %r10
+ movq ARG2, %r11
+ movq ARG3, %r12
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn // TODO scale gen
+
+ movq ARG4, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG6, %r10 // E
+ movq ARG7, %r11 // inv_diag_E
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG5, %r10 // D
+ movq ARG8, %r11 // km
+ movq ARG9, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48
+// void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
+
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .type kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, @function
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+_kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+ .def kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4; .scl 2; .type 32; .endef
+kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+ // call inner dgemm kernel nt add
+
+ movq ARG1, %r10 // kp
+ movq ARG2, %r11 // Ap
+ movq ARG3, %r12 // Bp
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner dgemm kernel nt sub
+
+ movq ARG4, %r10 // km
+ movq ARG5, %r11 // Am
+ movq ARG6, %r12 // Bm
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_SUB_NT_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_sub_nt_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_sub_nt_4x4_lib4
+#endif
+#endif
+
+
+ // call inner blender_loader nn
+
+ movq ARG7, %r10 // C
+
+#if MACRO_LEVEL>=1
+ INNER_BLEND_SCALE_11_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_blend_scale_11_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_blend_scale_11_4x4_lib4
+#endif
+#endif
+
+
+ // solve
+
+ movq ARG9, %r10 // E
+ movq ARG10, %r11 // inv_diag_E
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrsm_rlt_inv_4x4_vs_lib4
+#endif
+#endif
+
+
+ // store
+
+ movq ARG8, %r10 // D
+ movq ARG11, %r11 // km
+ movq ARG12, %r12 // kn
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_VS_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_vs_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_vs_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4, .-kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4
+#endif
+
+
+
+
+
+// 1 2 3 4 5 6 7
+// void kernel_dtrmm_nn_rl_4x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
+
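+// A hedged C reference (not part of the build) of my reading of this kernel:
+// D = alpha * A * B with the leading block of B lower triangular, so column j of the
+// result only involves rows l >= j of B; B is addressed through offsetB/sdb as in
+// the nn gemm sketch above, and there is no beta/C term in this kernel:
+//
+//     void kernel_dtrmm_nn_rl_4x4_lib4_ref(int k, double *alpha, double *A,
+//                                          int offsetB, double *B, int sdb, double *D)
+//         {
+//         int i, j, l, lb;
+//         for(j=0; j<4; j++)
+//             for(i=0; i<4; i++)
+//                 {
+//                 double acc = 0.0;
+//                 for(l=j; l<k; l++)
+//                     {
+//                     lb = offsetB + l;
+//                     acc += A[i+4*l] * B[(lb/4)*4*sdb + lb%4 + 4*j];
+//                     }
+//                 D[i+4*j] = alpha[0]*acc;
+//                 }
+//         }
+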
+ .p2align 4,,15
+#if defined(OS_LINUX)
+ .globl kernel_dtrmm_nn_rl_4x4_lib4
+ .type kernel_dtrmm_nn_rl_4x4_lib4, @function
+kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_MAC)
+ .globl _kernel_dtrmm_nn_rl_4x4_lib4
+_kernel_dtrmm_nn_rl_4x4_lib4:
+#elif defined(OS_WINDOWS)
+ .globl kernel_dtrmm_nn_rl_4x4_lib4
+ .def kernel_dtrmm_nn_rl_4x4_lib4; .scl 2; .type 32; .endef
+kernel_dtrmm_nn_rl_4x4_lib4:
+#endif
+
+ PROLOGUE
+
+ // zero accumulation registers
+
+ xorpd %xmm0, %xmm0
+ movapd %xmm0, %xmm1
+ movapd %xmm0, %xmm2
+ movapd %xmm0, %xmm3
+ movapd %xmm0, %xmm4
+ movapd %xmm0, %xmm5
+ movapd %xmm0, %xmm6
+ movapd %xmm0, %xmm7
+
+
+
+ // initial triangle
+
+ movq ARG1, %r10 // k
+ movq ARG3, %r11 // A
+ movq ARG5, %r12 // B
+ movq ARG6, %r13 // sdb
+ sall $5, %r13d // 4*sdb*sizeof(double)
+ movq ARG4, %r14 // offsetB
+
+#if MACRO_LEVEL>=1
+ INNER_EDGE_DTRMM_NN_RL_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_edge_dtrmm_nn_rl_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_edge_dtrmm_nn_rl_4x4_lib4
+#endif
+#endif
+
+	// call inner dgemm kernel nn after initial triangle
+
+#if MACRO_LEVEL>=2
+ INNER_KERNEL_DGEMM_ADD_NN_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_kernel_dgemm_add_nn_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_kernel_dgemm_add_nn_4x4_lib4
+#endif
+#endif
+
+
+ // call inner scale
+
+ movq ARG2, %r10 // alpha
+
+#if MACRO_LEVEL>=1
+ INNER_SCALE_A0_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_scale_a0_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_scale_a0_4x4_lib4
+#endif
+#endif
+
+
+ // store n
+
+ movq ARG7, %r10 // D
+
+#if MACRO_LEVEL>=1
+ INNER_STORE_4X4_LIB4
+#else
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ call inner_store_4x4_lib4
+#elif defined(OS_MAC)
+ callq _inner_store_4x4_lib4
+#endif
+#endif
+
+
+ EPILOGUE
+
+ ret
+
+#if defined(OS_LINUX)
+ .size kernel_dtrmm_nn_rl_4x4_lib4, .-kernel_dtrmm_nn_rl_4x4_lib4
+#endif
+
+
+
+
+
+ // read-only data
+#if defined(OS_LINUX)
+ .section .rodata.cst32,"aM",@progbits,32
+#elif defined(OS_MAC)
+ .section __TEXT,__const
+#elif defined(OS_WINDOWS)
+ .section .rdata,"dr"
+#endif
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC00: // { -1 -1 -1 1 }
+#elif defined(OS_MAC)
+LC00: // { -1 -1 -1 1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad 1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC01: // { -1 -1 -1 -1 }
+#elif defined(OS_MAC)
+LC01: // { -1 -1 -1 -1 }
+ .align 5
+#endif
+ .quad -1
+ .quad -1
+ .quad -1
+ .quad -1
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC02: // { 3.5 2.5 1.5 0.5 }
+#elif defined(OS_MAC)
+LC02: // { 3.5 2.5 1.5 0.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1071644672
+ .long 0
+ .long 1073217536
+ .long 0
+ .long 1074003968
+ .long 0
+ .long 1074528256
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC03: // { 7.5 6.5 5.5 4.5 }
+#elif defined(OS_MAC)
+LC03: // { 7.5 6.5 5.5 4.5 }
+ .align 5
+#endif
+ .long 0
+ .long 1074921472
+ .long 0
+ .long 1075183616
+ .long 0
+ .long 1075445760
+ .long 0
+ .long 1075707904
+
+#if defined(OS_LINUX) | defined(OS_WINDOWS)
+ .align 32
+.LC04: // { 1.0 1.0 1.0 1.0 }
+#elif defined(OS_MAC)
+LC04: // { 1.0 1.0 1.0 1.0 }
+ .align 5
+#endif
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+ .long 0
+ .long 1072693248
+
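+// The .long pairs above are the two little-endian 32-bit halves of IEEE-754 doubles
+// (low word first). A hedged stand-alone C check, assuming a little-endian host:
+//
+//     #include <stdint.h>
+//     #include <string.h>
+//     #include <stdio.h>
+//
+//     int main()
+//         {
+//         uint32_t w[2] = {0, 1071644672}; // first pair of .LC02
+//         double d;
+//         memcpy(&d, w, sizeof(d));
+//         printf("%f\n", d); // prints 0.500000, the low element of { 3.5 2.5 1.5 0.5 }
+//         return 0;
+//         }
+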
+
+
+#if defined(OS_LINUX)
+ .section .note.GNU-stack,"",@progbits
+#elif defined(OS_MAC)
+ .subsections_via_symbols
+#endif
+